import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st

pd.set_option('display.float_format', '{:g}'.format)
sns.set()

Dataset

K = 16
L = 3
M = ((K + L) * 47) % 11 + 1
pd.DataFrame([K, L, M], index=["K", "L", "M"], columns=[""])
K 16
L 3
M 3
M Dataset Description
3 case0201 Humerus length according to sparrow survival

Introduction

In the initial part, we load the data file and split the observed variable into the two respective observed groups. We briefly describe the data and the problem in the study. For each group separately, we estimate the mean, variance and median of the respective distribution.

data = pd.read_csv("case0201.csv")

S = data.loc[data["Status"] == "Survived", "Humerus"]
P = data.loc[data["Status"] == "Perished", "Humerus"]

Objective

After an unusually strong winter storm, some sparrows survived and some died. We are investigating whether the birds' deaths may have been caused by the small length of the arm bone.

Data description

The data in this dataset relate to the arm bone length of 59 adult male sparrows, of which 24 died and 35 survived.

Regressor Description Values
Humerus Length of arm bone of adult male sparrows (in hundredths of an inch) 733, 736, ...
Status Variable indicating whether the sparrow died or survived survived / perished

Point estimates

The mean value is estimated by the sample mean

$$ \bar{X}_n = \frac{1}{n} \sum_{i=1}^{n} X_i $$

the variance by the sample variance

$$ s_n^2 = \frac{1}{n-1} \sum_{i=1}^{n} \left( X_i - \bar{X}_n \right)^2 $$

and the standard deviation by the sample standard deviation

$$ s_n = \sqrt{s_n^2} $$
S_n = S.shape[0]
S_mean = np.mean(S)
S_var = np.var(S, ddof=1)
S_std = np.sqrt(S_var)
S_med = np.median(S)

P_n = P.shape[0]
P_mean = np.mean(P)
P_var = np.var(P, ddof=1)
P_std = np.sqrt(P_var)
P_med = np.median(P)


pd.DataFrame({
    "Frequency": [S_n, P_n],
    "Sample mean": [S_mean, P_mean],
    "Sample variance": [S_var, P_var],
    "Sample standard deviation": [S_std, P_std],
    "Median": [S_med, P_med],
}, index=["Survived", "Perished"]).round(4).T
Survived Perished
Frequency 35 24
Sample mean 738 727.917
Sample variance 393.588 554.254
Sample standard deviation 19.8391 23.5426
Median 736 733.5

Distribution function estimate

For each group separately, we estimate the density and distribution function using the histogram and the empirical distribution function.

Histogram

We estimate the density with a histogram over a given range of data using the selected $k$ bins.

$$ \hat{f}(x) = \frac{\text{number of observations in the same bin as } x}{\text{number of all observations}} $$

Empirical distribution function

We estimate the distribution function using the empirical distribution function.

$$ \hat{F}(x) = \frac{1}{n} \sum_{i=1}^{n} \mathbb{1}_{\{X \le x\}} $$
def process_bins(vals, bins, normalize=True):
    density_est = []
    my_bins = []
    
    for i in range(len(vals)):
        if i == len(vals) - 1:
            bin = f"[{bins[i]:.0f}, {bins[i + 1]:.0f}]"
        else:
            bin = f"[{bins[i]:.0f}, {bins[i + 1]:.0f})"
        my_bins.append(bin)

    for val in vals:
        if normalize:
            val = val / np.sum(vals)
        density_est.append(val)

    return density_est, my_bins

def histogram(data, bin_count):
    fig, axes = plt.subplots(1, 2, figsize=(10, 5), layout="constrained")
    
    counts, bins = np.histogram(data, bins=bin_count)
    bins = bins.round()
    
    ax = axes[0]
    sns.kdeplot(data, ax=ax, linewidth=1, color="black", alpha=0.3)
    sns.histplot(data, bins=bins, stat="density", ax=ax)
    sns.rugplot(data, ax=ax, color="black")
    ax.set_title("Histogram")
    
    ax = axes[1]
    sns.ecdfplot(data, stat="proportion", ax=ax, linewidth=1.5)
    sns.kdeplot(data, cumulative=True, ax=ax, linewidth=1, color="black", alpha=0.3)
    sns.rugplot(data, ax=ax, color="black", height=0.015)
    ax.set_title("Empirical distribution function")

    plt.show()

    density_est, my_bins =  process_bins(counts, bins)
    density_table = pd.DataFrame([my_bins, np.round(density_est, 4)], index=["Bin", "Esmtimation $f(x)$"], columns=[""]*len(my_bins))

    ecdf = st.ecdf(data)
    distrib_est, my_bins =  process_bins(ecdf.cdf.probabilities[:-1], ecdf.cdf.quantiles, normalize=False)
    distrib_table = pd.DataFrame([my_bins, np.round(distrib_est, 4)], index=["Bin", "Estimation $F(x)$"], columns=[""]*len(my_bins))
    
    return density_table, distrib_table

Survived

density_table, distrib_table = histogram(S, 9)

Density estimation

From the histogram we estimate the density

$$ \hat{f}_{\text{survived}}(x) = \begin{cases} \text{the corresponding value in the table,} &\quad x \in \left[ 687, 780 \right] \\ 0 \text{,} &\quad x \notin \left[ 687, 780 \right] \\ \end{cases} $$
density_table
Bin [687, 697) [697, 708) [708, 718) [718, 728) [728, 739) [739, 749) [749, 759) [759, 770) [770, 780]
Esmtimation $f(x)$ 0.0286 0.0286 0.0571 0.2 0.2 0.1714 0.1714 0.0857 0.0571

Estimation of the distribution function

From the empirical distribution function we estimate the distribution function

$$ \hat{F}_{\text{survived}}(x) = \begin{cases} \text{0,} &\quad x \lt 687\\ \text{the corresponding value in the table,} &\quad x \in \left[ 687, 780 \right] \\ \text{1,} &\quad x \gt 780\\ \end{cases} $$
display(distrib_table.iloc[:, :9])
display(distrib_table.iloc[:, 9:18])
display(distrib_table.iloc[:, 18:])
Bin [687, 703) [703, 709) [709, 715) [715, 721) [721, 723) [723, 726) [726, 728) [728, 729) [729, 730)
Estimation $F(x)$ 0.0286 0.0571 0.0857 0.1143 0.1429 0.2 0.2286 0.3143 0.3429
Bin [730, 733) [733, 735) [735, 736) [736, 739) [739, 741) [741, 743) [743, 749) [749, 751) [751, 752)
Estimation $F(x)$ 0.4 0.4571 0.4857 0.5143 0.5429 0.6571 0.6857 0.7143 0.7429
Bin [752, 755) [755, 756) [756, 766) [766, 767) [767, 769) [769, 770) [770, 780]
Estimation $F(x)$ 0.8 0.8286 0.8571 0.8857 0.9143 0.9429 0.9714

Perished

density_table, distrib_table = histogram(P, 6)

Density estimation

From the histogram we estimate the density

$$ \hat{f}_{\text{perished}}(x) = \begin{cases} \text{the corresponding value in the table,} &\quad x \in \left[ 659, 765 \right] \\ 0 \text{,} &\quad x \notin \left[ 659, 765 \right] \\ \end{cases} $$
density_table
Bin [659, 677) [677, 694) [694, 712) [712, 730) [730, 747) [747, 765]
Esmtimation $f(x)$ 0.0417 0.0417 0.125 0.25 0.375 0.1667

Estimation of the distribution function

From the empirical distribution function we estimate the distribution function

$$ \hat{F}_{\text{perished}}(x) = \begin{cases} \text{0,} &\quad x \lt 659\\ \text{the corresponding value in the table,} &\quad x \in \left[ 659, 765 \right] \\ \text{1,} &\quad x \gt 765\\ \end{cases} $$
display(distrib_table.iloc[:, :7])
display(distrib_table.iloc[:, 7:14])
display(distrib_table.iloc[:, 14:21])
Bin [659, 689) [689, 702) [702, 703) [703, 709) [709, 713) [713, 720) [720, 726)
Estimation $F(x)$ 0.0417 0.0833 0.125 0.1667 0.2083 0.25 0.3333
Bin [726, 729) [729, 731) [731, 736) [736, 737) [737, 738) [738, 739) [739, 743)
Estimation $F(x)$ 0.4167 0.4583 0.5 0.5417 0.5833 0.6667 0.7083
Bin [743, 744) [744, 745) [745, 752) [752, 754) [754, 765]
Estimation $F(x)$ 0.75 0.7917 0.8333 0.9167 0.9583

Statistical model

For each group separately, we find the closest distribution (estimate the parameters of the normal, exponential, and uniform distributions) and plot the corresponding densities with the estimated parameters on histogram plots. We also discuss which distribution fits the observed data most accurately.

Parameter estimation

The parameters of the exponential and uniform distribution are estimated by the method of moments.

We estimate the moments by sample moments.

$$ \widehat{EX^k} = m_k = \frac{1}{n} \sum_{i=1}^{n} X_i^k $$
S_m1 = 1 / S_n * np.sum(S)
S_m2 = 1 / S_n * np.dot(S, S)

P_m1 = 1 / P_n * np.sum(P)
P_m2 = 1 / P_n * np.dot(P, P)

pd.DataFrame({
    "Survived": [S_m1, S_m2],
    "Perished": [P_m1, P_m2],
}, index=["Sample moment $m_1$", "Sample moment $m_2$"]).round(4)
Survived Perished
Sample moment $m_1$ 738 727.917
Sample moment $m_2$ 545026 530394

We now calculate the unbiased and consistent estimates of the parameters of the normal distribution from the previous tasks.

Normal distribution $\mathcal{N}(\mu, \sigma^2)$

$ f(x) = \frac{1}{\sqrt{2\pi \sigma^2}} e^{-\frac{\left(x - \mu\right)^2}{2 \sigma^2}} \quad \text{for } x \in \mathbb{R} $

$$\hat{\mu} = \bar{X}_n$$$$\hat{\sigma}^2 = s_n^2$$

We express the parameters of the remaining two statistical models as functions of moments.

Exponential distribution $\text{Exp}(\lambda, L)$

$ f(x) = \begin{cases} \lambda e^{-\lambda \left(x - L\right)} & \text{for } x \geq 0 \\ 0 &\text{otherwise} \end{cases} $

$$\hat{\lambda} = \frac{1}{\sqrt{m_2 - m_1^2}}$$$$\hat{L} = m_1 - \sqrt{m_2 - m_1^2}$$

Uniform distribution $\mathcal{U}(a, b)$

$ f(x) = \begin{cases} \frac{1}{b-a} &\quad \text{for } a \leq x \leq b \\ 0 &\quad \text{otherwise} \end{cases} $

$$\hat{a} = m_1 - \sqrt{3 \cdot \left( m_2 - m_1^2 \right)}$$$$\hat{b} = m_1 + \sqrt{3 \cdot \left( m_2 - m_1^2 \right)}$$
def add_est(data, m1, m2, bin_count, mean, std, title, ax):
    X = np.linspace(data.min()-10, data.max()+10, 1000)
    counts, bins = np.histogram(data, bins=bin_count)
    bins = bins.round()
    
    lamb_hat = 1 / np.sqrt(m2 - m1*m1)
    l_hat = m1 - np.sqrt(m2 - m1*m1)
    
    a_hat = m1 - np.sqrt(3 * (m2 - m1*m1))
    b_hat = m1 + np.sqrt(3 * (m2 - m1*m1))

    Y_norm = st.norm.pdf(X, loc=mean, scale=std)
    
    X_exp = np.linspace(l_hat, data.max()+10, 1000)
    Y_exp = st.expon.pdf(X_exp, loc=l_hat, scale=1/lamb_hat)
    
    sns.histplot(data, bins=bins, stat="density", ax=ax, color="grey", alpha=0.3)
    
    sns.lineplot(x=X, y=Y_norm, ax=ax, label=f"N ( {mean:.0f}, {std*std:.0f} )", linewidth=2.5, color="royalblue")
    
    sns.lineplot(x=X_exp, y=Y_exp, ax=ax, label=f"Exp ( {lamb_hat:.4f}, {l_hat:.0f} )", linewidth=2.5, color="darkorange")
    sns.lineplot(x=[X.min(), l_hat], y=0.0003, ax=ax, linewidth=2.5, color="darkorange")
    
    sns.lineplot(x=[a_hat, b_hat], y=1 / (b_hat - a_hat), ax=ax, label=f"U ( {a_hat:.0f}, {b_hat:.0f} )", linewidth=2.5, color="green")
    sns.lineplot(x=[X.min(), a_hat], y=0.0003, ax=ax, linewidth=2.5, color="green")
    sns.lineplot(x=[b_hat, X.max()], y=0.0003, ax=ax, linewidth=2.5, color="green")
    
    ax.legend(loc="upper right")
    ax.set_title(title)

fig, axes = plt.subplots(1, 2, figsize=(10, 5), layout="constrained", sharey=True)
add_est(S, S_m1, S_m2, 9, S_mean, S_std, "Survived", axes[0])
add_est(P, P_m1, P_m2, 6, P_mean, P_std, "Perished", axes[1])

Conclusion

In both groups of sparrows, it is clear that a normal distribution best fits the observed data.


Simulation

For each group separately, we generate a random sample of 100 values from the distribution with the parameters estimated in the previous section. We compare the histogram of simulated values with the observed data.

def plot_norm(data, mean, std, bin_count):
    fig, axes = plt.subplots(1, 2, figsize=(10, 5), layout="constrained", sharey=True)

    counts, bins = np.histogram(data, bins=bin_count)
    bins = bins.round()
    w = bins[1] - bins[0]
    
    ax = axes[0]
    sns.histplot(data, bins=bins, stat="density", ax=ax)
    ax.set_title("Original data")

    X = np.linspace(data.min()-10, data.max()+10, 1000)
    while X.max() > bins.max():
        bins = np.append(bins, [bins.max() + w])
    while X.min() < bins.min():
        bins = np.insert(bins, 0, bins.min() - w)
    
    ax = axes[1]
    new_data = st.norm.rvs(mean, std, 100, random_state=42)
    sns.histplot(new_data, bins=bins, stat="density", ax=ax, color="tomato")
    ax.set_title("Simulated data")
    ax.set_xlabel("Humerus")

Survived

plot_norm(S, S_mean, S_std, 9)

Perished

plot_norm(P, P_mean, P_std, 6)

Conclusion

Histograms are similar in both cases.


Confidence interval

We now calculate the two-sided $95\%$ confidence interval for the mean for each group separately.

The two-sided $100 \cdot (1 - \alpha)\%$ confidence interval for $\mu$ with unknown variance is calculated as

$$ \left( \bar{X}_n - t_{\alpha / 2, n-1} \frac{s_n}{\sqrt{n}}, \bar{X}_n + t_{\alpha / 2, n-1} \frac{s_n}{\sqrt{n}} \right) $$
def interval(data, n, mean, std, alpha):
    t_a_half = st.t.isf(alpha / 2, n - 1)
    delta = t_a_half * std / np.sqrt(n)
    return np.array([mean - delta, mean + delta])

S_ci = interval(S, S_n, S_mean, S_std, 0.05).round(4)
P_ci = interval(P, P_n, P_mean, P_std, 0.05).round(4)

pd.DataFrame({
    "Survived": [f"( {S_ci[0]} , {S_ci[1]} )"],
    "Perished": [f"( {P_ci[0]} , {P_ci[1]} )"],
}, index=["Two-sided 95% confidence interval for $\\mu$"]).T
Two-sided 95% confidence interval for $\mu$
Survived ( 731.185 , 744.815 )
Perished ( 717.9755 , 737.8578 )

Mean value test

For each group separately, we test at the $5\%$ significance level the hypothesis that the mean is equal to K (the task parameter) against the two-sided alternative.

pd.DataFrame([K], index=["K"], columns=[""])
K 16

Survived

Test

At the significance level $\alpha = 5\%$, we test the hypothesis

$$ H_0 : \quad \mu = 16 $$$$ H_A : \quad \mu \not 16 $$

The corresponding confidence interval is

$$ (L, U) = (731.19 , 744.81)$$

The tested value $\mu_0 = 16$ does not lie in the interval.

Test conclusion

At the $5\%$ significance level, we reject the null hypothesis in favor of the alternative that the true mean arm bone length of sparrows that survived the winter is not $0.16 \text{in} \approx 4 \text{mm}$.

Perished

Test

At the significance level $\alpha = 5\%$, test the hypothesis

$$ H_0 : \quad \mu = 16 $$$$ H_A : \quad \mu \not 16 $$

The corresponding confidence interval is

$$ (L, U) = (717.98 , 737.86)$$

The tested value $\mu_0 = 16$ does not lie in the interval.

Test conclusion

At the $5\%$ significance level, we reject the null hypothesis in favor of the alternative that the true mean arm bone length of sparrows that did not survive the winter is not $0.16 \text{in} \approx 4 \text{mm}$.


Test for equality of means

At the $5\%$ significance level, we test whether the observed groups have the same mean.

Two-sample t-test

pd.DataFrame([np.round(S_std / P_std, 4)], index=["$s_{\text{survived}} / s_{\text{perished}}$"], columns=[""])
$s_{\text{survived}} / s_{\text{perished}}$ 0.8427

We can assume that the variance of both groups of sparrows is the same/similar:

$$ \frac{s_1}{s_2} \approx 0.8427 $$$$ \frac{1}{2} \lt \frac{s_1}{s_2} \lt 2 $$

For comparison, we use a two-sample t-test of the normal distribution.

The test is performed by comparing the test statistic $T$ with the corresponding critical value of the Student's $t$-distribution:

$$ T = \frac{\bar{X}_1 - \bar{X}_2}{s_{12}} \cdot \sqrt{ \frac{n_1 n_2}{n_1 + n_2}} $$

where

$$ s_{12} = \sqrt{ \frac{ \left( n_1 - 1 \right) s_1^2 + \left( n_2 - 1 \right) s_2^2 }{ n_1 + n_2 - 2 } } $$
SP_std = np.sqrt(( (S_n - 1) * S_var + (P_n - 1) * P_var )/( S_n + P_n - 2 ))
T_stat = (S_mean - P_mean) / SP_std * np.sqrt((S_n * P_n)/(S_n + P_n))
T_crit_05_57 = st.t.isf(0.05, 57)
T_crit_025_57 = st.t.isf(0.025, 57)

pd.DataFrame(np.array([T_stat, T_crit_05_57]).round(4),
             index=["$T$", "$t_{0.05, 57}$"], columns=["Value"])
Value
$T$ 1.777
$t_{0.05, 57}$ 1.672

Návrh

At the significance level $\alpha = 5\%$, test the hypothesis

$$ H_0 \colon \quad \mu_{\text{survived}} = \mu_{\text{perished}} $$$$ H_A \colon \quad \mu_{\text{survived}} > \mu_{\text{perished}} $$

The corresponding test statistic is $T = 1.777$.

Because

$$ 1.777 = T \gt t_{\alpha, n_1 + n_2 - 2} = t_{0.05, 57} = 1.672 $$

we reject the hypothesis of agreement at the $5\%$ significance level.


Conclusion

Based on our data, we can prove a significant difference in mean arm bone length between sparrows that survived the winter and sparrows that died during the winter. Those that survived the winter storm had a longer humerus on average.


Alternative conclusion

pd.DataFrame(np.array([T_stat, T_crit_025_57]).round(4),
             index=["$T$", "$t_{0.025, 57}$"], columns=["Value"])
Value
$T$ 1.777
$t_{0.025, 57}$ 2.0025

If we examined only the equality/inequality of means (two-sided)

$$ H_0 \colon \quad \mu_{\text{survived}} = \mu_{\text{perished}} $$$$ H_A \colon \quad \mu_{\text{survived}} > \mu_{\text{perished}} $$$$ 1.777 = |T| \lt t_{\alpha / 2, n_1 + n_2 − 2} = t_{0.025, 57} = 2.0025 $$

the $p$-value for a two-sided test would be greater than $5\%$, in which case $H_A$ would be statistically insignificant.

p_greater = st.mstats.ttest_ind(S, P, equal_var=True, alternative="greater")
p_two_sided = st.mstats.ttest_ind(S, P, equal_var=True, alternative="two-sided")
pd.DataFrame(np.array([p_greater, p_two_sided]).round(4),
             index=["One-sided test", "Two-sided test"], columns=["T statistic", "p-value"])
T statistic p-value
One-sided test 1.777 0.0405
Two-sided test 1.777 0.0809