# Tutorial: distribution with outliers

In this tutorial you will learn:

• How to deal with outliers and non-gaussian distributions

In the “intrinsic distribution” tutorial, we fitted a gaussian distribution. But maybe we do not want to make such a strong assumption – we may not know the exact distribution. Perhaps it is skewed, has several sub-populations. This can be tested by trying out different models.

Here, we check compare three scenarios:

1. baseline model: a gaussian distribution

2. baseline model + outlier model

3. heavy-tailed distribution

Here is our data:

[1]:

import numpy as np
# velocity dispersions of dwarf galaxies by van Dokkum et al., Nature, 555, 629 https://arxiv.org/abs/1803.10237v1

values = np.array([15, 4, 2, 11, 1, -2, -1, -14, -39, -3])
values_lo = np.array([7, 16, 6, 3, 6, 5, 10, 6, 11, 13])
values_hi = np.array([7, 15, 8, 3, 6, 6, 10, 7, 14, 14])

n_data = len(values)


## Visualise the data

Lets plot the data first to see what is going on:

[2]:

%matplotlib inline
import matplotlib.pyplot as plt
np.random.seed(42)

samples = []
for i in range(n_data):
# draw normal random points
u = np.random.normal(size=400)
v = values[i] + np.where(u < 0, u * values_lo[i], u * values_hi[i])
samples.append(v)

samples = np.array(samples)

plt.figure()
# for each galaxy, plot alittle cloud with its own colors
plt.violinplot(samples.transpose(), vert=False, showextrema=False)

xlabel = 'Velocity [km/s]'
plt.xlabel(xlabel)
plt.xlim(-50, 50);


## Models

### Gaussian

[3]:

parameters = ['mean', 'scatter']

def prior_transform(cube):
params = cube.copy()
params[0] = cube[0] * 200 - 100 # mean from -100 to +100
params[1] = 10**(cube[1] * 2)   # scatter from 1 to 100
return params

import scipy.stats

def log_likelihood(params):
probs_samples = scipy.stats.norm(*params).pdf(samples)
probs_objects = probs_samples.mean(axis=1)
loglike = np.log(probs_objects + 1e-100).sum()
return loglike

import ultranest

sampler = ultranest.ReactiveNestedSampler(parameters, log_likelihood, prior_transform)
result = sampler.run()
sampler.print_results()

[ultranest] Sampling 400 live points from prior ...
[ultranest] Widening roots to 404 live points (have 400 already) ...
[ultranest] Sampling 4 live points from prior ...

[ultranest] Explored until L=-4e+01
[ultranest] Likelihood function evaluations: 5432
[ultranest]   logZ = -43.88 +- 0.0808
[ultranest] Effective samples strategy satisfied (ESS = 1676.6, need >400)
[ultranest] Posterior uncertainty strategy is satisfied (KL: 0.46+-0.07 nat, need <0.50 nat)
[ultranest] Evidency uncertainty strategy is satisfied (dlogz=0.08, need <0.5)
[ultranest]   logZ error budget: single: 0.09 bs:0.08 tail:0.01 total:0.08 required:<0.50
[ultranest] done iterating.

logZ = -43.882 +- 0.143
single instance: logZ = -43.882 +- 0.093
bootstrapped   : logZ = -43.879 +- 0.142
tail           : logZ = +- 0.010
insert order U test : converged: True correlation: inf iterations

mean                : -28.7 │ ▁      ▁▁▁▁▁▁▁▁▁▁▁▂▃▄▅▆▇▇▆▄▃▂▁▁▁▁▁▁▁▁ │17.4      0.1 +- 4.0
scatter             : 1.0   │▁▂▄▆▇▇▆▅▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁  ▁          ▁ │57.2      9.5 +- 4.4



### Gaussian + Outliers

[4]:

parameters2 = ['mean', 'scatter', 'w_outlier']

def prior_transform2(cube):
params = cube.copy()
params[0] = cube[0] * 200 - 100 # mean from -100 to +100
params[1] = 10**(cube[1] * 2)   # scatter from 1 to 100
params[2] = (cube[2] * 0.2)     # fraction of outliers: allow 0-20%
return params

def log_likelihood2(params):
mean, scatter, weight = params

# compute probability if following distribution
probs_samples = scipy.stats.norm(mean, scatter).pdf(samples)
# compute probability density, if outlier:
prob_outlier = scipy.stats.uniform(-100, 200).pdf(samples)

# combine the two populations with weight:
probs_samples = probs_samples * (1 - weight) + weight * prob_outlier

# average within each object (logical OR)
probs_objects = probs_samples.mean(axis=1)
# multiply across object (logical AND)
loglike = np.log(probs_objects + 1e-100).sum()
return loglike

sampler2 = ultranest.ReactiveNestedSampler(parameters2, log_likelihood2, prior_transform2)
result2 = sampler2.run()
sampler2.print_results()

[ultranest] Sampling 400 live points from prior ...

[ultranest] Explored until L=-4e+01
[ultranest] Likelihood function evaluations: 7615
[ultranest]   logZ = -44.3 +- 0.06916
[ultranest] Effective samples strategy satisfied (ESS = 1844.5, need >400)
[ultranest] Posterior uncertainty strategy is satisfied (KL: 0.46+-0.06 nat, need <0.50 nat)
[ultranest] Evidency uncertainty strategy is satisfied (dlogz=0.07, need <0.5)
[ultranest]   logZ error budget: single: 0.09 bs:0.07 tail:0.01 total:0.07 required:<0.50
[ultranest] done iterating.

logZ = -44.299 +- 0.129
single instance: logZ = -44.299 +- 0.090
bootstrapped   : logZ = -44.305 +- 0.129
tail           : logZ = +- 0.010
insert order U test : converged: True correlation: inf iterations

mean                : -44.6 │ ▁        ▁  ▁▁▁▁▁▁▂▂▃▅▇▇▇▄▂▁▁▁▁▁    ▁ │29.4      0.8 +- 4.5
scatter             : 1.0   │▃▃▄▅▆▆▇▇▆▅▄▄▃▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁  ▁ ▁ ▁ │39.0      8.7 +- 4.5
w_outlier           : 0.000 │▇▆▇▇▆▆▆▇▅▅▆▇▅▆▄▅▅▄▃▄▅▄▃▄▅▆▄▄▄▅▄▄▄▄▃▃▃▃▃│0.200     0.085 +- 0.057



### Heavy-tailed distribution

[5]:

parameters3 = ['mean', 'scatter', 'dof']

def prior_transform3(cube):
params = cube.copy()
params[0] = cube[0] * 200 - 100 # mean from -100 to +100
params[1] = 10**(cube[1] * 2)   # scatter from 1 to 100
params[2] = 10**(cube[2])       # degrees of freedom in student-t: 1 (heavy tails)..10 (gaussian)
return params

def log_likelihood3(params):
mean, scatter, dof = params
# compute student-t probability
probs_samples = scipy.stats.t(loc=mean, scale=scatter, df=dof).pdf(samples)
# average within each object (logical OR)
probs_objects = probs_samples.mean(axis=1)
# multiply across object (logical AND)
loglike = np.log(probs_objects + 1e-100).sum()
return loglike

sampler3 = ultranest.ReactiveNestedSampler(parameters3, log_likelihood3, prior_transform3)
sampler3.run()
sampler3.print_results()

[ultranest] Sampling 400 live points from prior ...

[ultranest] Explored until L=-4e+01
[ultranest] Likelihood function evaluations: 7362
[ultranest]   logZ = -44.16 +- 0.06484
[ultranest] Effective samples strategy satisfied (ESS = 1728.7, need >400)
[ultranest] Posterior uncertainty strategy is satisfied (KL: 0.46+-0.06 nat, need <0.50 nat)
[ultranest] Evidency uncertainty strategy is satisfied (dlogz=0.07, need <0.5)
[ultranest]   logZ error budget: single: 0.09 bs:0.06 tail:0.01 total:0.07 required:<0.50
[ultranest] done iterating.

logZ = -44.168 +- 0.122
single instance: logZ = -44.168 +- 0.092
bootstrapped   : logZ = -44.164 +- 0.121
tail           : logZ = +- 0.010
insert order U test : converged: True correlation: inf iterations

mean                : -26.8 │ ▁   ▁ ▁▁▁▁▁▁▁▁▁▂▂▃▃▄▆▇▇▇▇▆▄▃▂▂▁▁▁▁▁▁▁ │18.8      0.7 +- 4.3
scatter             : 1.0   │▃▄▆▇▇▇▆▅▄▃▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁     ▁ │41.2      7.5 +- 4.2
dof                 : 1.0   │▇▇▅▆▆▅▅▄▄▄▄▃▂▃▃▃▃▂▂▃▂▂▂▂▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂│10.0      4.1 +- 2.5



## Comparing the models

[6]:

plt.figure(figsize=(5, 12))
plt.xlabel(xlabel)

from ultranest.plot import PredictionBand
x = np.linspace(-50, 50, 400)

plt.subplot(3, 1, 1)
band = PredictionBand(x)

for params in sampler.results['samples'][:40]:
mean, scatter = params

band.line(color='k')
plt.legend(loc='upper right')
plt.title("Gaussian: lnZ=%.1f" % sampler.results['logz'])

plt.subplot(3, 1, 2)
band = PredictionBand(x)
bando = PredictionBand(x)

for params in sampler2.results['samples'][:40]:
mean, scatter, outlier_weight = params
band.add(scipy.stats.norm(mean, scatter).pdf(x) * (1 - outlier_weight))

band.line(color='k');
bando.line(color='orange');
plt.legend(loc='upper right')
plt.title("Gaussian + outliers: lnZ=%.1f" % sampler2.results['logz'])

plt.subplot(3, 1, 3)

for params in sampler3.results['samples'][:40]:
mean, scatter, df = params

band.line(color='k');
plt.title("Student-t: lnZ=%.1f" % sampler3.results['logz'])
plt.legend(loc='upper right');


The models look very similar in their predictions, and their lnZ values are also quite close. So I would not have a strong preference in any direction.

Lets compare the posterior distributions with getdist:

[7]:

from getdist import MCSamples, plots

samples_g = MCSamples(samples=sampler.results['samples'],
names=sampler.results['paramnames'],
label='Gaussian',
settings=dict(smooth_scale_2D=3), sampler='nested')
samples_o = MCSamples(samples=sampler2.results['samples'][:,:2],
names=sampler2.results['paramnames'][:2],
label='Gaussian + Outlier',
settings=dict(smooth_scale_2D=3), sampler='nested')
samples_t = MCSamples(samples=sampler3.results['samples'][:,:2],
names=sampler3.results['paramnames'][:2],
label='Student-t',
settings=dict(smooth_scale_2D=3), sampler='nested')

mcsamples = [samples_g, samples_o, samples_t]

g = plots.get_subplot_plotter(width_inch=8)
g.settings.num_plot_contours = 1
g.triangle_plot(mcsamples, filled=False, contour_colors=plt.cm.Set1.colors)

Removed no burn in
Removed no burn in
Removed no burn in