import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st

pd.set_option('display.float_format', '{:g}'.format)
sns.set()

Dataset¶

K = 16
L = 3
M = ((K + L) * 47) % 11 + 1
pd.DataFrame([K, L, M], index=["K", "L", "M"], columns=[""])


K	16
L	3
M	3

M	Dataset	Description
3	case0201	Humerus length according to sparrow survival

Introduction¶

In the initial part, we load the data file and split the observed variable into the two respective observed groups. We briefly describe the data and the problem in the study. For each group separately, we estimate the mean, variance and median of the respective distribution.

data = pd.read_csv("case0201.csv")

S = data.loc[data["Status"] == "Survived", "Humerus"]
P = data.loc[data["Status"] == "Perished", "Humerus"]

Objective¶

After an unusually strong winter storm, some sparrows survived and some died. We are investigating whether the birds' deaths may have been caused by the small length of the arm bone.

Data description¶

The data in this dataset relate to the arm bone length of 59 adult male sparrows, of which 24 died and 35 survived.

Regressor	Description	Values
Humerus	Length of arm bone of adult male sparrows (in hundredths of an inch)	733, 736, ...
Status	Variable indicating whether the sparrow died or survived	survived / perished

Point estimates¶

The mean value is estimated by the sample mean

$$ \bar{X}_n = \frac{1}{n} \sum_{i=1}^{n} X_i $$

the variance by the sample variance

$$ s_n^2 = \frac{1}{n-1} \sum_{i=1}^{n} \left( X_i - \bar{X}_n \right)^2 $$

and the standard deviation by the sample standard deviation

$$ s_n = \sqrt{s_n^2} $$

S_n = S.shape[0]
S_mean = np.mean(S)
S_var = np.var(S, ddof=1)
S_std = np.sqrt(S_var)
S_med = np.median(S)

P_n = P.shape[0]
P_mean = np.mean(P)
P_var = np.var(P, ddof=1)
P_std = np.sqrt(P_var)
P_med = np.median(P)


pd.DataFrame({
    "Frequency": [S_n, P_n],
    "Sample mean": [S_mean, P_mean],
    "Sample variance": [S_var, P_var],
    "Sample standard deviation": [S_std, P_std],
    "Median": [S_med, P_med],
}, index=["Survived", "Perished"]).round(4).T

	Survived	Perished
Frequency	35	24
Sample mean	738	727.917
Sample variance	393.588	554.254
Sample standard deviation	19.8391	23.5426
Median	736	733.5

Distribution function estimate¶

For each group separately, we estimate the density and distribution function using the histogram and the empirical distribution function.

Histogram¶

We estimate the density with a histogram over a given range of data using the selected $k$ bins.

$$ \hat{f}(x) = \frac{\text{number of observations in the same bin as } x}{\text{number of all observations}} $$

Empirical distribution function¶

We estimate the distribution function using the empirical distribution function.

$$ \hat{F}(x) = \frac{1}{n} \sum_{i=1}^{n} \mathbb{1}_{\{X \le x\}} $$

def process_bins(vals, bins, normalize=True):
    density_est = []
    my_bins = []
    
    for i in range(len(vals)):
        if i == len(vals) - 1:
            bin = f"[{bins[i]:.0f}, {bins[i + 1]:.0f}]"
        else:
            bin = f"[{bins[i]:.0f}, {bins[i + 1]:.0f})"
        my_bins.append(bin)

    for val in vals:
        if normalize:
            val = val / np.sum(vals)
        density_est.append(val)

    return density_est, my_bins

def histogram(data, bin_count):
    fig, axes = plt.subplots(1, 2, figsize=(10, 5), layout="constrained")
    
    counts, bins = np.histogram(data, bins=bin_count)
    bins = bins.round()
    
    ax = axes[0]
    sns.kdeplot(data, ax=ax, linewidth=1, color="black", alpha=0.3)
    sns.histplot(data, bins=bins, stat="density", ax=ax)
    sns.rugplot(data, ax=ax, color="black")
    ax.set_title("Histogram")
    
    ax = axes[1]
    sns.ecdfplot(data, stat="proportion", ax=ax, linewidth=1.5)
    sns.kdeplot(data, cumulative=True, ax=ax, linewidth=1, color="black", alpha=0.3)
    sns.rugplot(data, ax=ax, color="black", height=0.015)
    ax.set_title("Empirical distribution function")

    plt.show()

    density_est, my_bins =  process_bins(counts, bins)
    density_table = pd.DataFrame([my_bins, np.round(density_est, 4)], index=["Bin", "Esmtimation $f(x)$"], columns=[""]*len(my_bins))

    ecdf = st.ecdf(data)
    distrib_est, my_bins =  process_bins(ecdf.cdf.probabilities[:-1], ecdf.cdf.quantiles, normalize=False)
    distrib_table = pd.DataFrame([my_bins, np.round(distrib_est, 4)], index=["Bin", "Estimation $F(x)$"], columns=[""]*len(my_bins))
    
    return density_table, distrib_table

Survived¶

density_table, distrib_table = histogram(S, 9)

Density estimation¶

From the histogram we estimate the density

$$ \hat{f}_{\text{survived}}(x) = \begin{cases} \text{the corresponding value in the table,} &\quad x \in \left[ 687, 780 \right] \\ 0 \text{,} &\quad x \notin \left[ 687, 780 \right] \\ \end{cases} $$

density_table


Bin	[687, 697)	[697, 708)	[708, 718)	[718, 728)	[728, 739)	[739, 749)	[749, 759)	[759, 770)	[770, 780]
Esmtimation $f(x)$	0.0286	0.0286	0.0571	0.2	0.2	0.1714	0.1714	0.0857	0.0571

Estimation of the distribution function¶

From the empirical distribution function we estimate the distribution function

$$ \hat{F}_{\text{survived}}(x) = \begin{cases} \text{0,} &\quad x \lt 687\\ \text{the corresponding value in the table,} &\quad x \in \left[ 687, 780 \right] \\ \text{1,} &\quad x \gt 780\\ \end{cases} $$

display(distrib_table.iloc[:, :9])
display(distrib_table.iloc[:, 9:18])
display(distrib_table.iloc[:, 18:])


Bin	[687, 703)	[703, 709)	[709, 715)	[715, 721)	[721, 723)	[723, 726)	[726, 728)	[728, 729)	[729, 730)
Estimation $F(x)$	0.0286	0.0571	0.0857	0.1143	0.1429	0.2	0.2286	0.3143	0.3429


Bin	[730, 733)	[733, 735)	[735, 736)	[736, 739)	[739, 741)	[741, 743)	[743, 749)	[749, 751)	[751, 752)
Estimation $F(x)$	0.4	0.4571	0.4857	0.5143	0.5429	0.6571	0.6857	0.7143	0.7429


Bin	[752, 755)	[755, 756)	[756, 766)	[766, 767)	[767, 769)	[769, 770)	[770, 780]
Estimation $F(x)$	0.8	0.8286	0.8571	0.8857	0.9143	0.9429	0.9714

Perished¶

density_table, distrib_table = histogram(P, 6)

Density estimation¶

From the histogram we estimate the density

$$ \hat{f}_{\text{perished}}(x) = \begin{cases} \text{the corresponding value in the table,} &\quad x \in \left[ 659, 765 \right] \\ 0 \text{,} &\quad x \notin \left[ 659, 765 \right] \\ \end{cases} $$

density_table


Bin	[659, 677)	[677, 694)	[694, 712)	[712, 730)	[730, 747)	[747, 765]
Esmtimation $f(x)$	0.0417	0.0417	0.125	0.25	0.375	0.1667

Estimation of the distribution function¶

From the empirical distribution function we estimate the distribution function

$$ \hat{F}_{\text{perished}}(x) = \begin{cases} \text{0,} &\quad x \lt 659\\ \text{the corresponding value in the table,} &\quad x \in \left[ 659, 765 \right] \\ \text{1,} &\quad x \gt 765\\ \end{cases} $$

display(distrib_table.iloc[:, :7])
display(distrib_table.iloc[:, 7:14])
display(distrib_table.iloc[:, 14:21])


Bin	[659, 689)	[689, 702)	[702, 703)	[703, 709)	[709, 713)	[713, 720)	[720, 726)
Estimation $F(x)$	0.0417	0.0833	0.125	0.1667	0.2083	0.25	0.3333


Bin	[726, 729)	[729, 731)	[731, 736)	[736, 737)	[737, 738)	[738, 739)	[739, 743)
Estimation $F(x)$	0.4167	0.4583	0.5	0.5417	0.5833	0.6667	0.7083


Bin	[743, 744)	[744, 745)	[745, 752)	[752, 754)	[754, 765]
Estimation $F(x)$	0.75	0.7917	0.8333	0.9167	0.9583

Statistical model¶

For each group separately, we find the closest distribution (estimate the parameters of the normal, exponential, and uniform distributions) and plot the corresponding densities with the estimated parameters on histogram plots. We also discuss which distribution fits the observed data most accurately.

Parameter estimation¶

The parameters of the exponential and uniform distribution are estimated by the method of moments.

We estimate the moments by sample moments.

$$ \widehat{EX^k} = m_k = \frac{1}{n} \sum_{i=1}^{n} X_i^k $$

S_m1 = 1 / S_n * np.sum(S)
S_m2 = 1 / S_n * np.dot(S, S)

P_m1 = 1 / P_n * np.sum(P)
P_m2 = 1 / P_n * np.dot(P, P)

pd.DataFrame({
    "Survived": [S_m1, S_m2],
    "Perished": [P_m1, P_m2],
}, index=["Sample moment $m_1$", "Sample moment $m_2$"]).round(4)

	Survived	Perished
Sample moment $m_1$	738	727.917
Sample moment $m_2$	545026	530394

We now calculate the unbiased and consistent estimates of the parameters of the normal distribution from the previous tasks.

Normal distribution $\mathcal{N}(\mu, \sigma^2)$

$ f(x) = \frac{1}{\sqrt{2\pi \sigma^2}} e^{-\frac{\left(x - \mu\right)^2}{2 \sigma^2}} \quad \text{for } x \in \mathbb{R} $

$$\hat{\mu} = \bar{X}_n$$$$\hat{\sigma}^2 = s_n^2$$

We express the parameters of the remaining two statistical models as functions of moments.

Exponential distribution $\text{Exp}(\lambda, L)$

$ f(x) = \begin{cases} \lambda e^{-\lambda \left(x - L\right)} & \text{for } x \geq 0 \\ 0 &\text{otherwise} \end{cases} $

$$\hat{\lambda} = \frac{1}{\sqrt{m_2 - m_1^2}}$$$$\hat{L} = m_1 - \sqrt{m_2 - m_1^2}$$

Uniform distribution $\mathcal{U}(a, b)$

$ f(x) = \begin{cases} \frac{1}{b-a} &\quad \text{for } a \leq x \leq b \\ 0 &\quad \text{otherwise} \end{cases} $

$$\hat{a} = m_1 - \sqrt{3 \cdot \left( m_2 - m_1^2 \right)}$$$$\hat{b} = m_1 + \sqrt{3 \cdot \left( m_2 - m_1^2 \right)}$$

def add_est(data, m1, m2, bin_count, mean, std, title, ax):
    X = np.linspace(data.min()-10, data.max()+10, 1000)
    counts, bins = np.histogram(data, bins=bin_count)
    bins = bins.round()
    
    lamb_hat = 1 / np.sqrt(m2 - m1*m1)
    l_hat = m1 - np.sqrt(m2 - m1*m1)
    
    a_hat = m1 - np.sqrt(3 * (m2 - m1*m1))
    b_hat = m1 + np.sqrt(3 * (m2 - m1*m1))

    Y_norm = st.norm.pdf(X, loc=mean, scale=std)
    
    X_exp = np.linspace(l_hat, data.max()+10, 1000)
    Y_exp = st.expon.pdf(X_exp, loc=l_hat, scale=1/lamb_hat)
    
    sns.histplot(data, bins=bins, stat="density", ax=ax, color="grey", alpha=0.3)
    
    sns.lineplot(x=X, y=Y_norm, ax=ax, label=f"N ( {mean:.0f}, {std*std:.0f} )", linewidth=2.5, color="royalblue")
    
    sns.lineplot(x=X_exp, y=Y_exp, ax=ax, label=f"Exp ( {lamb_hat:.4f}, {l_hat:.0f} )", linewidth=2.5, color="darkorange")
    sns.lineplot(x=[X.min(), l_hat], y=0.0003, ax=ax, linewidth=2.5, color="darkorange")
    
    sns.lineplot(x=[a_hat, b_hat], y=1 / (b_hat - a_hat), ax=ax, label=f"U ( {a_hat:.0f}, {b_hat:.0f} )", linewidth=2.5, color="green")
    sns.lineplot(x=[X.min(), a_hat], y=0.0003, ax=ax, linewidth=2.5, color="green")
    sns.lineplot(x=[b_hat, X.max()], y=0.0003, ax=ax, linewidth=2.5, color="green")
    
    ax.legend(loc="upper right")
    ax.set_title(title)

fig, axes = plt.subplots(1, 2, figsize=(10, 5), layout="constrained", sharey=True)
add_est(S, S_m1, S_m2, 9, S_mean, S_std, "Survived", axes[0])
add_est(P, P_m1, P_m2, 6, P_mean, P_std, "Perished", axes[1])

Conclusion

In both groups of sparrows, it is clear that a normal distribution best fits the observed data.

Simulation¶

For each group separately, we generate a random sample of 100 values from the distribution with the parameters estimated in the previous section. We compare the histogram of simulated values with the observed data.

def plot_norm(data, mean, std, bin_count):
    fig, axes = plt.subplots(1, 2, figsize=(10, 5), layout="constrained", sharey=True)

    counts, bins = np.histogram(data, bins=bin_count)
    bins = bins.round()
    w = bins[1] - bins[0]
    
    ax = axes[0]
    sns.histplot(data, bins=bins, stat="density", ax=ax)
    ax.set_title("Original data")

    X = np.linspace(data.min()-10, data.max()+10, 1000)
    while X.max() > bins.max():
        bins = np.append(bins, [bins.max() + w])
    while X.min() < bins.min():
        bins = np.insert(bins, 0, bins.min() - w)
    
    ax = axes[1]
    new_data = st.norm.rvs(mean, std, 100, random_state=42)
    sns.histplot(new_data, bins=bins, stat="density", ax=ax, color="tomato")
    ax.set_title("Simulated data")
    ax.set_xlabel("Humerus")

Survived¶

plot_norm(S, S_mean, S_std, 9)

Perished¶

plot_norm(P, P_mean, P_std, 6)

Conclusion

Histograms are similar in both cases.

Confidence interval¶

We now calculate the two-sided $95\%$ confidence interval for the mean for each group separately.

The two-sided $100 \cdot (1 - \alpha)\%$ confidence interval for $\mu$ with unknown variance is calculated as

$$ \left( \bar{X}_n - t_{\alpha / 2, n-1} \frac{s_n}{\sqrt{n}}, \bar{X}_n + t_{\alpha / 2, n-1} \frac{s_n}{\sqrt{n}} \right) $$

def interval(data, n, mean, std, alpha):
    t_a_half = st.t.isf(alpha / 2, n - 1)
    delta = t_a_half * std / np.sqrt(n)
    return np.array([mean - delta, mean + delta])

S_ci = interval(S, S_n, S_mean, S_std, 0.05).round(4)
P_ci = interval(P, P_n, P_mean, P_std, 0.05).round(4)

pd.DataFrame({
    "Survived": [f"( {S_ci[0]} , {S_ci[1]} )"],
    "Perished": [f"( {P_ci[0]} , {P_ci[1]} )"],
}, index=["Two-sided 95% confidence interval for $\\mu$"]).T

	Two-sided 95% confidence interval for $\mu$
Survived	( 731.185 , 744.815 )
Perished	( 717.9755 , 737.8578 )

Mean value test¶

For each group separately, we test at the $5\%$ significance level the hypothesis that the mean is equal to K (the task parameter) against the two-sided alternative.

pd.DataFrame([K], index=["K"], columns=[""])


K	16

Survived¶

Test

At the significance level $\alpha = 5\%$, we test the hypothesis

$$ H_0 : \quad \mu = 16 $$$$ H_A : \quad \mu \not 16 $$

The corresponding confidence interval is

$$ (L, U) = (731.19 , 744.81)$$

The tested value $\mu_0 = 16$ does not lie in the interval.

Test conclusion

At the $5\%$ significance level, we reject the null hypothesis in favor of the alternative that the true mean arm bone length of sparrows that survived the winter is not $0.16 \text{in} \approx 4 \text{mm}$.

Perished¶

Test

At the significance level $\alpha = 5\%$, test the hypothesis

$$ H_0 : \quad \mu = 16 $$$$ H_A : \quad \mu \not 16 $$

The corresponding confidence interval is

$$ (L, U) = (717.98 , 737.86)$$

The tested value $\mu_0 = 16$ does not lie in the interval.

Test conclusion

At the $5\%$ significance level, we reject the null hypothesis in favor of the alternative that the true mean arm bone length of sparrows that did not survive the winter is not $0.16 \text{in} \approx 4 \text{mm}$.

Test for equality of means¶

At the $5\%$ significance level, we test whether the observed groups have the same mean.

Two-sample t-test¶

pd.DataFrame([np.round(S_std / P_std, 4)], index=["$s_{\text{survived}} / s_{\text{perished}}$"], columns=[""])


$s_{\text{survived}} / s_{\text{perished}}$	0.8427

We can assume that the variance of both groups of sparrows is the same/similar:

$$ \frac{s_1}{s_2} \approx 0.8427 $$$$ \frac{1}{2} \lt \frac{s_1}{s_2} \lt 2 $$

For comparison, we use a two-sample t-test of the normal distribution.

The test is performed by comparing the test statistic $T$ with the corresponding critical value of the Student's $t$-distribution:

$$ T = \frac{\bar{X}_1 - \bar{X}_2}{s_{12}} \cdot \sqrt{ \frac{n_1 n_2}{n_1 + n_2}} $$

where

$$ s_{12} = \sqrt{ \frac{ \left( n_1 - 1 \right) s_1^2 + \left( n_2 - 1 \right) s_2^2 }{ n_1 + n_2 - 2 } } $$

SP_std = np.sqrt(( (S_n - 1) * S_var + (P_n - 1) * P_var )/( S_n + P_n - 2 ))
T_stat = (S_mean - P_mean) / SP_std * np.sqrt((S_n * P_n)/(S_n + P_n))
T_crit_05_57 = st.t.isf(0.05, 57)
T_crit_025_57 = st.t.isf(0.025, 57)

pd.DataFrame(np.array([T_stat, T_crit_05_57]).round(4),
             index=["$T$", "$t_{0.05, 57}$"], columns=["Value"])

	Value
$T$	1.777
$t_{0.05, 57}$	1.672

Návrh

At the significance level $\alpha = 5\%$, test the hypothesis

$$ H_0 \colon \quad \mu_{\text{survived}} = \mu_{\text{perished}} $$$$ H_A \colon \quad \mu_{\text{survived}} > \mu_{\text{perished}} $$

The corresponding test statistic is $T = 1.777$.

Because

$$ 1.777 = T \gt t_{\alpha, n_1 + n_2 - 2} = t_{0.05, 57} = 1.672 $$

we reject the hypothesis of agreement at the $5\%$ significance level.

Conclusion¶

Based on our data, we can prove a significant difference in mean arm bone length between sparrows that survived the winter and sparrows that died during the winter. Those that survived the winter storm had a longer humerus on average.

Alternative conclusion¶

pd.DataFrame(np.array([T_stat, T_crit_025_57]).round(4),
             index=["$T$", "$t_{0.025, 57}$"], columns=["Value"])

	Value
$T$	1.777
$t_{0.025, 57}$	2.0025

If we examined only the equality/inequality of means (two-sided)

$$ H_0 \colon \quad \mu_{\text{survived}} = \mu_{\text{perished}} $$$$ H_A \colon \quad \mu_{\text{survived}} > \mu_{\text{perished}} $$$$ 1.777 = |T| \lt t_{\alpha / 2, n_1 + n_2 − 2} = t_{0.025, 57} = 2.0025 $$

the $p$-value for a two-sided test would be greater than $5\%$, in which case $H_A$ would be statistically insignificant.

p_greater = st.mstats.ttest_ind(S, P, equal_var=True, alternative="greater")
p_two_sided = st.mstats.ttest_ind(S, P, equal_var=True, alternative="two-sided")
pd.DataFrame(np.array([p_greater, p_two_sided]).round(4),
             index=["One-sided test", "Two-sided test"], columns=["T statistic", "p-value"])

	T statistic	p-value
One-sided test	1.777	0.0405
Two-sided test	1.777	0.0809

The effect of arm bone length on the survival of sparrows in winter storms

Tommy Chu

Dataset¶

Introduction¶

Objective¶

Data description¶

Point estimates¶

Distribution function estimate¶

Histogram¶

Empirical distribution function¶

Survived¶

Density estimation¶

Estimation of the distribution function¶

Perished¶

Density estimation¶

Estimation of the distribution function¶

Statistical model¶

Parameter estimation¶

Simulation¶

Survived¶

Perished¶

Confidence interval¶

Mean value test¶

Survived¶

Perished¶

Test for equality of means¶

Two-sample t-test¶

Conclusion¶

Alternative conclusion¶