{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Nutrition study" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import scipy.stats as stats\n", "import statsmodels.api as sm" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Example: Nutrition study 1" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "First visualise the data :)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Enter data\n", "A = np.array([7.53, 7.48, 8.08, 8.09, 10.15, 8.40, 10.88, 6.13, 7.90])\n", "B = np.array([9.21, 11.51, 12.79, 11.85, 9.97, 8.79, 9.69, 9.68, 9.19])\n", "\n", "plt.hist(A, alpha=0.7)\n", "plt.hist(B, alpha=0.7)\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Lets calculate mean and sample standard deviation for each group" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "mean_A = A.mean()\n", "mean_B = B.mean()\n", "print([mean_A, mean_B])\n", "\n", "s_A = A.std(ddof=1)\n", "s_B = B.std(ddof=1)\n", "print([s_A, s_B])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can plot the mean values as well" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plt.hist(A, alpha=0.5)\n", "plt.hist(B, alpha=0.5)\n", "plt.axvline(mean_A, linestyle='-', color=\"blue\", ymin=0, ymax=1)\n", "plt.axvline(mean_B, linestyle='-', color=\"orange\", ymin=0, ymax=1)\n", "\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Great!\n", "But are they the same?\n", "\n", "What else can we do?\n", "\n", "lets calculate standard error (se) for each group mean and add the confidence intervals " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "n_A = len(A) \n", "n_B = len(B)\n", "print([n_A, n_B])\n", "se_A = s_A/np.sqrt(n_A)\n", "se_B = s_B/np.sqrt(n_B)\n", "print([se_A, se_B])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Need to choose alpha - we choose alpha = 0.05 \n", "t_0975 = stats.t.ppf(0.975, df=n_A-1) # same for B since n_A = n_B\n", "\n", "CI_A = [mean_A - t_0975 * se_A, mean_A + t_0975 * se_A]\n", "CI_B = [mean_B - t_0975 * se_B, mean_B + t_0975 * se_B]\n", "print(CI_A)\n", "print(CI_B)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plt.hist(A, alpha=0.5)\n", "plt.hist(B, alpha=0.5)\n", "plt.axvline(mean_A, linestyle='-', color=\"blue\", ymin=0, ymax=1)\n", "plt.axvline(mean_B, linestyle='-', color=\"orange\", ymin=0, ymax=1)\n", "plt.axvline(CI_A[0], linestyle='--', color=\"blue\", ymin=0, ymax=1)\n", "plt.axvline(CI_A[1], linestyle='--', color=\"blue\", ymin=0, ymax=1)\n", "plt.axvline(CI_B[0], linestyle='--', color=\"orange\", ymin=0, ymax=1)\n", "plt.axvline(CI_B[1], linestyle='--', color=\"orange\", ymin=0, ymax=1)\n", "\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Another typical way to plot (only mean and CI):" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plt.plot([1], mean_A, 'o', color=\"blue\")\n", "plt.plot([2], mean_B, 'o', color=\"orange\")\n", "plt.plot([1,1], CI_A, linestyle='-', color=\"blue\")\n", "plt.plot([2,2], CI_B, linestyle='-', color=\"orange\")\n", "plt.xlim([0,3])\n", "plt.xticks([1,2], [\"group A\", \"group B\"])\n", "\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Example: Nutrition study 2:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Define the null hypothesis\n", "mean_null_hyp = 0" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# calculate estimated difference\n", "diff = mean_B - mean_A\n", "print(diff)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# calculate standard error of the difference\n", "se_diff = np.sqrt(se_A**2 + se_B**2)\n", "print(se_diff)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Compute the \"test statistic\" from the oberserved data\n", "tobs = (diff - mean_null_hyp) / se_diff\n", "print(tobs)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# compute v (degrees of freedom for difference-test)\n", "v = (se_A**2 + se_B**2)**2 / (se_A**4/(n_A-1) + se_B**4/(n_B-1))\n", "print(v)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# find t_0.975 from t-distribution with df = v\n", "stats.t.ppf(0.975, df=v)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# compare t_obs with t_0.975 (critical value)\n", "print(tobs)\n", "print(stats.t.ppf(0.975, df=v))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "t_obs is greater than the critical value - we should reject the null hypothesis (if using significance level alpha = 0.05)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# calculate p-value:\n", "2*stats.t.cdf(-tobs, df=v)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "p-value is smaller than 0.05" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# You can also use the ttest_ind function from scipy.stats:\n", "# \"_ind\" is for independent - i.e., two independent samples\n", "test = stats.ttest_ind(B,A, equal_var=False) \n", "# \"equal_var=False\" means we are NOT assuming equal variances - this is a \"Welch t-test\"\n", "print(test)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Since 0.001 < p < 0.01 we conclude there is \"Strong evidence against H_0\"\n", "\n", "And we conclude that the average energy usage of nurses in Hospital B is significantly different from that of Hospital A. " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Example: Nutrition study 3:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Define the null hypothesis\n", "mean_null_hyp = 0" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# calculate estimated difference\n", "diff = mean_B - mean_A\n", "print(diff)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# print the 2 sample variances\n", "print([s_A**2, s_B**2])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "notice that the two sample variances are almost the same - so maybe the assumption of equal variances is justified (?)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# calculate pooled estimate of variance (here we calculate s_p, so take the square root of the variance)\n", "s_p = np.sqrt(( (n_A-1)*s_A**2 + (n_B-1)*s_B**2 ) / (n_A + n_B - 2))\n", "\n", "# print the pooled variance (to check the value)\n", "print(s_p**2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The pooled sample variance is close to the values for the two sample variances - to value seems correct" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# calculate standard error of the difference\n", "se_diff_pooled = np.sqrt(s_p**2/n_A + s_p**2/n_B)\n", "print(se_diff_pooled)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Compute the \"test statistic\" from the oberserved data\n", "tobs_pooled = (diff - mean_null_hyp) / se_diff_pooled\n", "print(tobs_pooled)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# compute v (degrees of freedom for difference-test)\n", "v_pooled = n_A + n_B - 2\n", "print(v_pooled)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# find t_0.975 from t-distribution with df = v\n", "stats.t.ppf(0.975, df=v_pooled)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# compare t_obs with t_0.975 (critical value)\n", "print(tobs_pooled)\n", "print(stats.t.ppf(0.975, df=v_pooled))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "t_obs is greater than the critical value - we should reject the null hypothesis (if using significance level alpha = 0.05)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# calculate p-value:\n", "2*stats.t.cdf(-tobs, df=v_pooled)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "p-value is smaller than 0.05" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# You can also use the ttest_ind function from scipy.stats:\n", "# \"_ind\" is for independent - i.e., two independent samples\n", "test_pooled = stats.ttest_ind(B,A, equal_var=True) \n", "# \"equal_var=True\" means we ARE assuming equal variances - this is a \"pooled t-test\"\n", "print(test_pooled)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Since 0.001 < p < 0.01 we conclude there is \"Strong evidence against H_0\"\n", "\n", "And we conclude that the average energy usage of nurses in Hospital B is significantly different from that of Hospital A. \n", "\n", "Notice the Welch and the pooled t-test give almost the same result in this case!" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Example: Nutrition study 4:" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Confidence interval:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# we use the Welch version with degrees of freedom stored in variable \"v\"\n", "print(v)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# calculate confidence interval upper and lowe limits:\n", "diff_lower = diff - stats.t.ppf(0.975, df=v)*se_diff\n", "diff_upper = diff + stats.t.ppf(0.975, df=v)*se_diff\n", "print([diff_lower,diff_upper])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# You can also retrieve the confidence interval from the \"test\" calculated with stats.ttest_ind:\n", "test = stats.ttest_ind(B,A,equal_var=False) \n", "print(test.confidence_interval(0.95))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The confidence interval does not include the value 0." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Example: Nutrition study 5:" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "An assumption in the t-test is that the samples come from underlying normal distributions or they are large enough for CLT to apply (30 or more in each sample).\n", "\n", "Here the sameples are small (nA = nB = 9). So we need to assume that the populations are normal. We could check this visually with QQ-plots." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# plot the data again - do the distributions look normal?\n", "\n", "plt.hist(A, alpha=0.7)\n", "plt.hist(B, alpha=0.7)\n", "plt.title(\"Hospital A (blue) and Hospital B (orange)\")\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Lets make the QQ-plots:\n", "fig, axs = plt.subplots(1, 2, figsize=(10,4))\n", "sm.qqplot(A,line=\"q\",a=3/8,ax=axs[0]) \n", "sm.qqplot(B,line=\"q\",a=3/8,ax=axs[1]) \n", "# OBS: \"a = 3/8\" is preferred for n <= 10 \n", "# (\"a = 1/2\" is preferred for n > 10) \n", "axs[0].set_title(\"Hospital A\")\n", "axs[1].set_title(\"Hospital B\")\n", "plt.show()\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Its difficult to tell whether it looks normal (but some points are deviating from the straight line)\n", "\n", "We can try to compare with simulated data from normal distributions" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# for Hospital A\n", "fig, axs = plt.subplots(3, 3, figsize=(8,8))\n", "\n", "# data in first subplot (upper left corner):\n", "sm.qqplot(A,line=\"q\",a=3/8,ax=axs[0,0]) \n", "\n", "# simulated data in all other subplots:\n", "sm.qqplot(stats.norm.rvs(size=len(A)),line=\"q\",a=3/8,ax=axs[0,1]) \n", "sm.qqplot(stats.norm.rvs(size=len(A)),line=\"q\",a=3/8,ax=axs[0,2]) \n", "sm.qqplot(stats.norm.rvs(size=len(A)),line=\"q\",a=3/8,ax=axs[1,0]) \n", "sm.qqplot(stats.norm.rvs(size=len(A)),line=\"q\",a=3/8,ax=axs[1,1]) \n", "sm.qqplot(stats.norm.rvs(size=len(A)),line=\"q\",a=3/8,ax=axs[1,2]) \n", "sm.qqplot(stats.norm.rvs(size=len(A)),line=\"q\",a=3/8,ax=axs[2,0]) \n", "sm.qqplot(stats.norm.rvs(size=len(A)),line=\"q\",a=3/8,ax=axs[2,1]) \n", "sm.qqplot(stats.norm.rvs(size=len(A)),line=\"q\",a=3/8,ax=axs[2,2]) \n", "plt.tight_layout()\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# for Hospital B\n", "fig, axs = plt.subplots(3, 3, figsize=(8,8))\n", "\n", "# data in first subplot (upper left corner):\n", "sm.qqplot(B,line=\"q\",a=3/8,ax=axs[0,0]) \n", "\n", "# simulated data in all other subplots:\n", "sm.qqplot(stats.norm.rvs(size=len(B)),line=\"q\",a=3/8,ax=axs[0,1]) \n", "sm.qqplot(stats.norm.rvs(size=len(B)),line=\"q\",a=3/8,ax=axs[0,2]) \n", "sm.qqplot(stats.norm.rvs(size=len(B)),line=\"q\",a=3/8,ax=axs[1,0]) \n", "sm.qqplot(stats.norm.rvs(size=len(B)),line=\"q\",a=3/8,ax=axs[1,1]) \n", "sm.qqplot(stats.norm.rvs(size=len(B)),line=\"q\",a=3/8,ax=axs[1,2]) \n", "sm.qqplot(stats.norm.rvs(size=len(B)),line=\"q\",a=3/8,ax=axs[2,0]) \n", "sm.qqplot(stats.norm.rvs(size=len(B)),line=\"q\",a=3/8,ax=axs[2,1]) \n", "sm.qqplot(stats.norm.rvs(size=len(B)),line=\"q\",a=3/8,ax=axs[2,2]) \n", "plt.tight_layout()\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "When comparing to simulated data, it looks like our data could very well come from an underlying normal distribution.\n", "\n", "This is no proof of normality(!), but it can give us an idea whether the assumption could be true." ] } ], "metadata": { "kernelspec": { "display_name": "Pernille_Env_python_311", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.13" } }, "nbformat": 4, "nbformat_minor": 2 }