{ "cells": [ { "cell_type": "markdown", "id": "63f2cb41", "metadata": {}, "source": [ "# IntroStat Week 5\n", "\n", "Simulation of the samples and QQ-plots" ] }, { "cell_type": "code", "execution_count": null, "id": "88fc8695", "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import scipy.stats as stats\n", "\n", "# NEW: import statsmodels.api to do automated q-q-plot\n", "import statsmodels.api as sm" ] }, { "cell_type": "markdown", "id": "abe4110a", "metadata": {}, "source": [ "### Simulation: Sample from Normal distribution" ] }, { "cell_type": "code", "execution_count": null, "id": "55af8038", "metadata": {}, "outputs": [], "source": [ "np.random.seed(24)" ] }, { "cell_type": "code", "execution_count": null, "id": "e6c55bac", "metadata": {}, "outputs": [], "source": [ "# (repeat this cell many times)\n", "\n", "# population parameters:\n", "mu = 100\n", "sigma = 12\n", "\n", "# simulated sample:\n", "n = 10\n", "data = stats.norm.rvs(mu, sigma, size=n)\n", "\n", "# significance level (for plotting confidence interval of the mean)\n", "a = 0.05\n", "\n", "# plotting\n", "x = np.arange(70,130,0.1)\n", "plt.xlim(70,130)\n", "plt.plot(x, .2*np.sqrt(2*np.pi*sigma**2)*stats.norm.pdf(x, loc=mu, scale=sigma), color=\"red\",alpha=0.3)\n", "plt.plot(x, .2*np.sqrt(2*np.pi*sigma**2/n)*stats.norm.pdf(x, loc=mu, scale=sigma/np.sqrt(n)), color=\"black\", linestyle='--',alpha=0.3)\n", "plt.hist(data, density=True, color='deepskyblue', bins=6)\n", "plt.axvline(data.mean(), linestyle='-', color=\"blue\", ymin=0, ymax=1)\n", "plt.plot([data.mean()-data.std(ddof=1)/np.sqrt(n), data.mean()+data.std(ddof=1)/np.sqrt(n)], [0.06,0.06], linestyle='-', color=\"blue\")\n", "plt.axvline(stats.t.ppf(a/2, df=n-1)*data.std(ddof=1)/np.sqrt(n)+data.mean(), linestyle='--', color=\"blue\", ymin=0, ymax=1)\n", "plt.axvline(stats.t.ppf(1-a/2, df=n-1)*data.std(ddof=1)/np.sqrt(n)+data.mean(), linestyle='--', color=\"blue\", ymin=0, ymax=1)\n", "plt.tick_params(left = False, right = False , labelleft = False) \n", "plt.show()" ] }, { "cell_type": "markdown", "id": "abcd18b7", "metadata": {}, "source": [ "The histogram of the sample does not always look very \"normal\"\n", "\n", "(back to slides)" ] }, { "cell_type": "markdown", "id": "89ce4224", "metadata": {}, "source": [ "### ECDF and Q-Q plot" ] }, { "cell_type": "code", "execution_count": null, "id": "ef145d9b", "metadata": {}, "outputs": [], "source": [ "np.random.seed(24)" ] }, { "cell_type": "code", "execution_count": null, "id": "c162aac2", "metadata": {}, "outputs": [], "source": [ "# (repeat this cell many times)\n", "data = stats.norm.rvs(mu, sigma, size=n)\n", "\n", "# split plot in 2 \n", "fig, axs = plt.subplots(1, 2, figsize=(10*2/3,4))\n", "\n", "# plot histogram\n", "axs[0].hist(data, density=True, bins=6)\n", "axs[0].set_xlim([70,130])\n", "\n", "# plot ecdf\n", "axs[1].ecdf(data)\n", "axs[1].plot(np.arange(70,130,1), stats.norm.cdf(np.arange(70,130,1), loc=mu, scale=sigma))\n", "axs[1].set_xlim([70,130])\n", "\n", "plt.tight_layout()\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "id": "545a754a", "metadata": {}, "outputs": [], "source": [ "np.random.seed(24)" ] }, { "cell_type": "code", "execution_count": null, "id": "c13cc996", "metadata": {}, "outputs": [], "source": [ "# (repeat this cell many times)\n", "data = stats.norm.rvs(mu, sigma, size=n)\n", "\n", "# split plot in 3\n", "fig, axs = plt.subplots(1, 3, figsize=(10,4))\n", "\n", "# plot histogram\n", "axs[0].hist(data, density=True, bins=6)\n", "axs[0].set_xlim([70,130])\n", "\n", "# plot ecdf\n", "axs[1].ecdf(data)\n", "axs[1].plot(np.arange(70,130,1), stats.norm.cdf(np.arange(70,130,1), loc=mu, scale=sigma))\n", "axs[1].set_xlim([70,130])\n", "\n", "# plot qq-plot\n", "sm.qqplot(data,line=\"q\",a=3/8,ax=axs[2])\n", "# OBS: \"a = 3/8\" is preferred for n <= 10 \n", "# (\"a = 1/2\" is preferred for n > 10) \n", "\n", "plt.tight_layout()\n", "plt.show()" ] }, { "cell_type": "markdown", "id": "ecaea6b5", "metadata": {}, "source": [ "Now try larger samples:" ] }, { "cell_type": "code", "execution_count": null, "id": "03f7723b", "metadata": {}, "outputs": [], "source": [ "np.random.seed(24)" ] }, { "cell_type": "code", "execution_count": null, "id": "a16dc51d", "metadata": {}, "outputs": [], "source": [ "# (repeat this cell many times)\n", "\n", "n = 100 # larger sample size\n", "data = stats.norm.rvs(mu, sigma, size=n)\n", "\n", "# split plot in 3\n", "fig, axs = plt.subplots(1, 3, figsize=(10,4))\n", "\n", "# plot histogram\n", "axs[0].hist(data, density=True)\n", "axs[0].set_xlim([70,130])\n", "\n", "# plot ecdf\n", "axs[1].ecdf(data)\n", "axs[1].plot(np.arange(70,130,1), stats.norm.cdf(np.arange(70,130,1), loc=mu, scale=sigma))\n", "axs[1].set_xlim([70,130])\n", "\n", "# plot qq-plot\n", "sm.qqplot(data,line=\"q\",a=1/2,ax=axs[2])\n", "# OBS: \"a = 3/8\" is preferred for n <= 10 \n", "# (\"a = 1/2\" is preferred for n > 10) \n", "\n", "plt.tight_layout()\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "id": "d2712d4e", "metadata": {}, "outputs": [], "source": [ "# examples with exponentially distributed data:\n", "\n", "n = 10 # try both n=10 and n=100\n", "data = stats.expon.rvs(mu, sigma, size=n)\n", "fig, axs = plt.subplots(1, 3, figsize=(10,4))\n", "axs[0].hist(data, density=True)\n", "axs[1].ecdf(data)\n", "axs[1].plot(np.arange(70,150,1), stats.norm.cdf(np.arange(70,150,1), loc=stats.expon.mean(loc=mu, scale=sigma), scale=stats.expon.std(loc=mu, scale=sigma)))\n", "axs[1].set_xlim(70,150)\n", "sm.qqplot(data,line=\"q\",a=1/2,ax=axs[2])\n", "\n", "plt.tight_layout()\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "id": "9afd5a65", "metadata": {}, "outputs": [], "source": [ "# examples with uniformly distributed data:\n", "\n", "n = 10 # try both n=10 and n=100\n", "data = stats.uniform.rvs(mu, sigma, size=n)\n", "fig, axs = plt.subplots(1, 3, figsize=(10,4))\n", "axs[0].hist(data, density=True)\n", "axs[1].ecdf(data)\n", "axs[1].plot(np.arange(90,120,1), stats.norm.cdf(np.arange(90,120,1), loc=stats.uniform.mean(loc=mu, scale=sigma), scale=stats.uniform.std(loc=mu, scale=sigma)))\n", "axs[1].set_xlim(90,120)\n", "sm.qqplot(data,line=\"q\",a=1/2,ax=axs[2])\n", "\n", "plt.tight_layout()\n", "plt.show()" ] } ], "metadata": { "kernelspec": { "display_name": "Pernille_Env_python_311", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.13" } }, "nbformat": 4, "nbformat_minor": 5 }