# Tutorial: distribution with outliers¶

In this tutorial you will learn:

• How to deal with outliers and non-gaussian distributions

In the “intrinsic distribution” tutorial, we fitted a gaussian distribution. But maybe we do not want to make such a strong assumption – we may not know the exact distribution. Perhaps it is skewed, has several sub-populations. This can be tested by trying out different models.

Here, we check compare three scenarios:

1. baseline model: a gaussian distribution

2. baseline model + outlier model

3. heavy-tailed distribution

Here is our data:

:

import numpy as np
# velocity dispersions of dwarf galaxies by van Dokkum et al., Nature, 555, 629 https://arxiv.org/abs/1803.10237v1

values = np.array([15, 4, 2, 11, 1, -2, -1, -14, -39, -3])
values_lo = np.array([7, 16, 6, 3, 6, 5, 10, 6, 11, 13])
values_hi = np.array([7, 15, 8, 3, 6, 6, 10, 7, 14, 14])

n_data = len(values)



## Visualise the data¶

Lets plot the data first to see what is going on:

:

%matplotlib inline
import matplotlib.pyplot as plt

samples = []
for i in range(n_data):
# draw normal random points
u = np.random.normal(size=400)
v = values[i] + np.where(u < 0, u * values_lo[i], u * values_hi[i])
samples.append(v)

samples = np.array(samples)

plt.figure()
# for each galaxy, plot alittle cloud with its own colors
plt.violinplot(samples.transpose(), vert=False, showextrema=False)

xlabel = 'Velocity [km/s]'
plt.xlabel(xlabel)
plt.xlim(-50, 50); ## Models¶

### Gaussian¶

:

parameters = ['mean', 'scatter']

def prior_transform(cube):
params = cube.copy()
params = cube * 200 - 100 # mean from -100 to +100
params = 10**(cube * 2)   # scatter from 1 to 100
return params

import scipy.stats

def log_likelihood(params):
probs_samples = scipy.stats.norm(*params).pdf(samples)
probs_objects = probs_samples.mean(axis=1)
loglike = np.log(probs_objects + 1e-100).sum()
return loglike

import ultranest

sampler = ultranest.ReactiveNestedSampler(parameters, log_likelihood, prior_transform)
result = sampler.run()
sampler.print_results()


[ultranest] Sampling 400 live points from prior ...

[ultranest] Explored until L=-4e+01
[ultranest] Likelihood function evaluations: 5284
[ultranest]   logZ = -44.01 +- 0.05714
[ultranest] Effective samples strategy satisfied (ESS = 1673.6, need >400)
[ultranest] Posterior uncertainty strategy is satisfied (KL: 0.45+-0.08 nat, need <0.50 nat)
[ultranest] Evidency uncertainty strategy is satisfied (dlogz=0.13, need <0.5)
[ultranest]   logZ error budget: single: 0.09 bs:0.06 tail:0.01 total:0.06 required:<0.50
[ultranest] done iterating.

logZ = -44.001 +- 0.087
single instance: logZ = -44.001 +- 0.090
bootstrapped   : logZ = -44.015 +- 0.087
tail           : logZ = +- 0.010

mean                0.5 +- 4.3
scatter             8.7 +- 4.7


### Gaussian + Outliers¶

:

parameters2 = ['mean', 'scatter', 'w_outlier']

def prior_transform2(cube):
params = cube.copy()
params = cube * 200 - 100 # mean from -100 to +100
params = 10**(cube * 2)   # scatter from 1 to 100
params = (cube * 0.2)     # fraction of outliers: allow 0-20%
return params

def log_likelihood2(params):
mean, scatter, weight = params

# compute probability if following distribution
probs_samples = scipy.stats.norm(mean, scatter).pdf(samples)
# compute probability density, if outlier:
prob_outlier = scipy.stats.uniform(-100, 200).pdf(samples)

# combine the two populations with weight:
probs_samples = probs_samples * (1 - weight) + weight * prob_outlier

# average within each object (logical OR)
probs_objects = probs_samples.mean(axis=1)
# multiply across object (logical AND)
loglike = np.log(probs_objects + 1e-100).sum()
return loglike

sampler2 = ultranest.ReactiveNestedSampler(parameters2, log_likelihood2, prior_transform2)
result2 = sampler2.run()
sampler2.print_results()

[ultranest] Sampling 400 live points from prior ...

[ultranest] Explored until L=-4e+01
[ultranest] Likelihood function evaluations: 7411
[ultranest]   logZ = -44.17 +- 0.05804
[ultranest] Effective samples strategy satisfied (ESS = 1751.6, need >400)
[ultranest] Posterior uncertainty strategy is satisfied (KL: 0.45+-0.07 nat, need <0.50 nat)
[ultranest] Evidency uncertainty strategy is satisfied (dlogz=0.13, need <0.5)
[ultranest]   logZ error budget: single: 0.09 bs:0.06 tail:0.01 total:0.06 required:<0.50
[ultranest] done iterating.

logZ = -44.164 +- 0.126
single instance: logZ = -44.164 +- 0.087
bootstrapped   : logZ = -44.170 +- 0.126
tail           : logZ = +- 0.010

mean                1.2 +- 4.4
scatter             8.0 +- 4.7
w_outlier           0.088 +- 0.057


### Heavy-tailed distribution¶

:

parameters3 = ['mean', 'scatter', 'dof']

def prior_transform3(cube):
params = cube.copy()
params = cube * 200 - 100 # mean from -100 to +100
params = 10**(cube * 2)   # scatter from 1 to 100
params = 10**(cube)       # degrees of freedom in student-t: 1 (heavy tails)..10 (gaussian)
return params

def log_likelihood3(params):
mean, scatter, dof = params
# compute student-t probability
probs_samples = scipy.stats.t(loc=mean, scale=scatter, df=dof).pdf(samples)
# average within each object (logical OR)
probs_objects = probs_samples.mean(axis=1)
# multiply across object (logical AND)
loglike = np.log(probs_objects + 1e-100).sum()
return loglike

sampler3 = ultranest.ReactiveNestedSampler(parameters3, log_likelihood3, prior_transform3)
sampler3.run()
sampler3.print_results()

[ultranest] Sampling 400 live points from prior ...

[ultranest] Explored until L=-4e+01
[ultranest] Likelihood function evaluations: 7171
[ultranest]   logZ = -43.98 +- 0.05878
[ultranest] Effective samples strategy satisfied (ESS = 1651.5, need >400)
[ultranest] Posterior uncertainty strategy is satisfied (KL: 0.46+-0.06 nat, need <0.50 nat)
[ultranest] Evidency uncertainty strategy is satisfied (dlogz=0.12, need <0.5)
[ultranest]   logZ error budget: single: 0.09 bs:0.06 tail:0.01 total:0.06 required:<0.50
[ultranest] done iterating.

logZ = -43.976 +- 0.122
single instance: logZ = -43.976 +- 0.088
bootstrapped   : logZ = -43.975 +- 0.122
tail           : logZ = +- 0.010

mean                0.9 +- 4.2
scatter             6.9 +- 4.2
dof                 4.0 +- 2.5


## Comparing the models¶

:

plt.figure(figsize=(5, 12))
plt.xlabel(xlabel)

from ultranest.plot import PredictionBand
x = np.linspace(-50, 50, 400)

plt.subplot(3, 1, 1)
band = PredictionBand(x)

for params in sampler.results['samples'][:40]:
mean, scatter = params

band.line(color='k')
plt.legend(loc='upper right')
plt.title("Gaussian: lnZ=%.1f" % sampler.results['logz'])

plt.subplot(3, 1, 2)
band = PredictionBand(x)
bando = PredictionBand(x)

for params in sampler2.results['samples'][:40]:
mean, scatter, outlier_weight = params
band.add(scipy.stats.norm(mean, scatter).pdf(x) * (1 - outlier_weight))

band.line(color='k');
bando.line(color='orange');
plt.legend(loc='upper right')
plt.title("Gaussian + outliers: lnZ=%.1f" % sampler2.results['logz'])

plt.subplot(3, 1, 3)

for params in sampler3.results['samples'][:40]:
mean, scatter, df = params

band.line(color='k');
plt.title("Student-t: lnZ=%.1f" % sampler3.results['logz'])
plt.legend(loc='upper right'); The models look very similar in their predictions, and their lnZ values are also quite close. So I would not have a strong preference in any direction.

Lets compare the posterior distributions with getdist:

:

from getdist import MCSamples, plots

samples_g = MCSamples(samples=sampler.results['samples'],
names=sampler.results['paramnames'],
label='Gaussian',
settings=dict(smooth_scale_2D=3), sampler='nested')
samples_o = MCSamples(samples=sampler2.results['samples'][:,:2],
names=sampler2.results['paramnames'][:2],
label='Gaussian + Outlier',
settings=dict(smooth_scale_2D=3), sampler='nested')
samples_t = MCSamples(samples=sampler3.results['samples'][:,:2],
names=sampler3.results['paramnames'][:2],
label='Student-t',
settings=dict(smooth_scale_2D=3), sampler='nested')

mcsamples = [samples_g, samples_o, samples_t]

g = plots.get_subplot_plotter(width_inch=8)
g.settings.num_plot_contours = 1
g.triangle_plot(mcsamples, filled=False, contour_colors=plt.cm.Set1.colors)


Removed no burn in
Removed no burn in
Removed no burn in 