{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Two way ANOVA \n", "\n", "(week 11 - two way ANOVA)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import pandas as pd\n", "import scipy.stats as stats\n", "import statsmodels.api as sm\n", "import statsmodels.formula.api as smf\n", "import statsmodels.stats.power as smp\n", "import statsmodels.stats.proportion as smprop\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 1) Introduction and data example" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To introduce two-way ANOVA we start by considering the following data (and do some data visuallisation)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Make pandas dataframe with grouped data:\n", "data = pd.DataFrame({\n", " 'value': [2.8, 3.6, 3.4, 2.3, 5.5, 6.3, 6.1, 5.7, 5.8, 8.3, 6.9, 6.1], \n", " 'treatment': [\"A\", \"A\", \"A\", \"A\", \"B\", \"B\", \"B\", \"B\", \"C\", \"C\", \"C\", \"C\"],\n", " 'block': [\"1\", \"2\", \"3\", \"4\", \"1\", \"2\", \"3\", \"4\", \"1\", \"2\", \"3\", \"4\"]})\n", "data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Alternative way of typing the data:\n", "data = pd.DataFrame({\n", " 'value': [2.8, 3.6, 3.4, 2.3, 5.5, 6.3, 6.1, 5.7, 5.8, 8.3, 6.9, 6.1], \n", " 'treatment': np.repeat([\"A\", \"B\", \"C\"], 4),\n", " 'block': np.tile([\"1\", \"2\", \"3\", \"4\"], 3)})\n", "data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Alternative way of typing the data (more like the book):\n", "\n", "# OBS: in lecture copy paste from book p 329 to show issue with special characters\n", "\n", "y = np.array([2.8, 3.6, 3.4, 2.3,\n", " 5.5, 6.3, 6.1, 5.7,\n", " 5.8, 8.3, 6.9, 6.1])\n", "treatm = pd.Categorical([1, 1, 1, 1,\n", " 2, 2, 2, 2,\n", " 3, 3, 3, 3])\n", "block = pd.Categorical([1, 2, 3, 4,\n", " 1, 2, 3, 4,\n", " 1, 2, 3, 4])\n", "D = pd.DataFrame({'y': y, 'treatm': treatm, 'block': block})\n", "D" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Different ways of typing data result in the same kind of dataframe (table with data)\n", "\n", "Be aware of copy-paste issues with special characters " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Visualising the data with box plots:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fig, axs = plt.subplots(1, 2, figsize=(10, 5))\n", "data.boxplot(\"value\", by=\"treatment\", ax=axs[0], showmeans=True)\n", "data.boxplot(\"value\", by=\"block\", ax=axs[1], showmeans=True)\n", "fig.suptitle(\"Boxplots\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "KAHOOT! (x2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2) SST, SS(Tr) and SS(Bl)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Compute the overall mean and add to dataframe:\n", "data['overall_mean'] = data[\"value\"].mean()\n", "data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# compute the mean within each treatment-group and add to dataframe:\n", "data['treatment_mean'] = data.groupby(\"treatment\")['value'].transform('mean')\n", "data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# compute the mean within each person (block) and add to dataframe:\n", "data['block_mean'] = data.groupby(\"block\")['value'].transform('mean')\n", "data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# SST\n", "SST = np.sum((data['value'] - data['overall_mean'])**2)\n", "print(SST)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# SS(Tr)\n", "SSTr = np.sum((data['treatment_mean'] - data['overall_mean'])**2)\n", "print(SSTr)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# SS(Bl)\n", "SSBl = np.sum((data['block_mean'] - data['overall_mean'])**2)\n", "print(SSBl)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 3) Estimate parameters $\\mu$, $\\alpha_i$, $\\beta_j$ and $\\sigma^2$ (MSE)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "$Y_{ij} = \\mu + \\alpha_i + \\beta_j + \\epsilon_{ij}$" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We start with residuals (and SSE):" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# alpha (for each Treatment group):\n", "data[\"treatment_alpha\"] = data[\"treatment_mean\"] - data[\"overall_mean\"]\n", "data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# beta (for each Block):\n", "data[\"block_beta\"] = data[\"block_mean\"] - data[\"overall_mean\"]\n", "data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Residuals" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# calculate the residuals according to the model:\n", "data['residual'] = data['value'] - (data['overall_mean']+data['treatment_alpha']+data['block_beta'])\n", "data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "SSE = np.sum(data['residual']**2)\n", "print(SSE)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# sanity check for decomposition of variation (SST = SSTr + SSBl + SSE):\n", "print(SST)\n", "print(SSTr + SSBl + SSE)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# estimating the residual variance:\n", "MSE = SSE/((3-1)*(4-1)) # MSE = SSE /((k-1)*(l-1))\n", "sigma = np.sqrt(MSE)\n", "print(sigma)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 4) F-test and ANOVA table" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "TReatment groups:" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "$F_{Tr}=\\frac{SS(Tr)/(k-1)}{SSE/((k-1)(l-1))}$" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Compute F-statistic (for treatment groups)\n", "FTr = (SSTr/(3-1)) / (SSE/((3-1)*(4-1)))\n", "print(FTr)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# compute critical value:\n", "stats.f.ppf(0.95, dfn = (3-1), dfd = (3-1)*(4-1) )" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# p-value\n", "p_Tr = 1 - stats.f.cdf(FTr, dfn = (3-1), dfd = (3-1)*(4-1) )\n", "print(p_Tr)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Blocks:" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "$F_{Bl}=\\frac{SS(Bl)/(l-1)}{SSE/((k-1)(l-1))}$" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Compute F-statistic (for block)\n", "FBl = (SSBl/(4-1)) / (SSE/((3-1)*(4-1)))\n", "print(FBl)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# compute critical value:\n", "stats.f.ppf(0.95, dfn = (4-1), dfd = (3-1)*(4-1) )" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# p-value\n", "p_Bl = 1 - stats.f.cdf(FBl, dfn = (4-1), dfd = (3-1)*(4-1) )\n", "print(p_Bl)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Make the ANOVA table:\n", "fit = smf.ols(\"value ~ treatment + block\", data=data).fit()\n", "anova_table = sm.stats.anova_lm(fit)\n", "print(anova_table)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In Python the ANOVA table does not have a row for \"total sum of squares\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "KAHOOT! (x2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 5) Model control" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "$Y_{ij} = \\mu + \\alpha_i + \\beta_j + \\epsilon_{ij}$" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# QQplot:\n", "sm.qqplot(data[\"residual\"], line='q',a=1/2)\n", "plt.tight_layout()\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Histogram:\n", "plt.hist(data[\"residual\"])\n", "plt.tight_layout()\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Normalfordelt?" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Er de også uafhængige? (fx uafhængige af gruppering)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Plot residuals grouped by either treatment or block:\n", "fig, axs = plt.subplots(1, 2, figsize=(10, 5))\n", "data.boxplot(\"residual\", by=\"treatment\", showmeans=True, ax=axs[0])\n", "data.boxplot(\"residual\", by=\"block\", showmeans=True, ax=axs[1])\n", "fig.suptitle(\"Boxplots of residuals\")\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Hvordan vurderer vi det visuelt?\n", "\n", "Er der \"systematiske\" tendenser (mønster)?\n", "\n", "Hvad er trekanterne i plottet ovenfor (og hvorfor er de alle nul?)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 6) Example from book" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We do this together during the lecture" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "D = pd.DataFrame({\n", "'y': [22.5, 24.3, 24.9, 22.4,\n", "21.5, 21.3, 23.9, 18.4,\n", "22.2, 21.9, 21.7, 17.9],\n", "'car': pd.Categorical([1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4]),\n", "'tire': pd.Categorical([1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3]),\n", "})\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "D" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(D['y'].mean())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(D.groupby(\"tire\")['y'].mean())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fig, ax = plt.subplots(ncols=2)\n", "D.boxplot(column='y', by='tire', ax=ax[0],grid=False)\n", "ax[0].set_title('Boxplots by tire')\n", "D.boxplot(column='y', by='car', ax=ax[1],grid=False)\n", "ax[1].set_title('Boxplots by car')\n", "\n", "plt.tight_layout()\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fit = smf.ols('y ~ car + tire', data=D).fit()\n", "anova = sm.stats.anova_lm(fit)\n", "print(anova)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "D['residuals'] = fit.resid.values \n", "D" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plt.hist(D['residuals'])\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sm.qqplot(D['residuals'], line='q', a=1/2)\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fig, ax = plt.subplots(ncols=2)\n", "D.boxplot(column='residuals', by='car', ax=ax[0],grid=False, showmeans=True)\n", "ax[0].set_title('Residuals by car')\n", "D.boxplot(column='residuals', by='tire', ax=ax[1],grid=False, showmeans=True)\n", "ax[1].set_title('Residuals by tire')\n", "plt.suptitle('')\n", "plt.tight_layout()\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "MSE = anova.loc['Residual', 'mean_sq']\n", "print(MSE)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "D" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "np.sum(D['residuals']**2)/(6)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "M = 3 * (3 - 1) / 2 \n", "m = 4 \n", "LSD_bonf = stats.t.ppf(1-(0.05/M)/2, 6) * np.sqrt(2*MSE/m)\n", "print(LSD_bonf)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(D.groupby('tire',observed=True)['y'].mean())" ] } ], "metadata": { "kernelspec": { "display_name": "Pernille_Env_python_311", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.13" } }, "nbformat": 4, "nbformat_minor": 2 }