{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", "from ipywidgets import *\n", "import matplotlib.pyplot as plt\n", "from IPython.display import set_matplotlib_formats\n", "set_matplotlib_formats('svg')\n", "import numpy as np\n", "import scipy.stats as stats" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "def corr_plot(slope=1, sd=0, typ='kowariancja', show=False):\n", " x = np.linspace(0, 10, 100)\n", " y = slope * x + np.random.normal(0,sd,100)\n", " fig, axes = plt.subplots(figsize=(6,6))\n", " plt.scatter(x, y)\n", " c = np.corrcoef(x, y)[0][1] if typ==\"korelacja\" else np.cov(x,y)[0][1]\n", " c = round(c,3)\n", " title = \"c=\"+str(c) if typ=='kowariancja' else \"r=\"+str(c)\n", " if show:\n", " plt.title(title)\n", " plt.xlim(0,10)\n", " plt.ylim(-10,10)\n", " \n", "def corr_examples(example=\"1\", typ='kowariancja', show=False):\n", " x = np.linspace(1, 10, 100)\n", " y = -x**3 + np.random.normal(0,100,100) if example==\"1\" else np.cos(x) + np.random.normal(0,0.1,100)\n", " plt.scatter(x, y)\n", " if show:\n", " if typ==\"korelacja\":\n", " plt.title(\"r=\"+str(round(np.corrcoef(x, y)[0][1],3)))\n", " else:\n", " plt.title(\"c=\"+str(round(np.cov(x, y)[0][1],3)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Korelacja i regresja liniowa" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Kowariancja" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- Kowariancja zmiennych losowych\n", "\n", "$$\\sigma^2_X = E[(X-\\mu)^2]$$\n", "\n", "$$\\sigma_{X,Y} = E[(X-\\mu_x)(Y-\\mu_y)]$$\n", "\n", "$$\\sigma_{X,X} =?$$\n", "\n", "- Estymator kowariancji\n", "\n", "$$S_{XY}=\\frac{1}{n-1}\\sum_{i=1}^{n}(X_i-\\bar{X})(Y_i-\\bar{Y})$$\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Przykład:\n", "\n", "| X | Y |\n", "| --- | --- |\n", "| 1 | 5 |\n", "| 2 | 6 |\n", "| 3 | 1 |" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "mx = \n", "\n", "my = \n", "\n", "sxy =" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "c9b839df554442b5ad199355f1521b3a", "version_major": 2, "version_minor": 0 }, "text/plain": [ "interactive(children=(FloatSlider(value=1.0, description='slope', max=1.0, min=-1.0), IntSlider(value=0, descr…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "interact(corr_plot, slope=(-1, 1, 0.1), sd=(0,10,1), typ=[\"kowariancja\", \"korelacja\"])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- Kowariancja a niezależność zmiennych\n", "\n", "$\\;\\;\\;\\;\\;E[(X-\\mu_x)(Y-\\mu_y)] = E[XY - X\\mu_Y-Y\\mu_X+\\mu_X\\mu_Y]=E[XY]-E[X\\mu_Y]-E[Y\\mu_X]+\\mu_X\\mu_Y=E[XY]-\\mu_X\\mu_Y$\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "a53b214dd7a94ac88554c21cdfec8ab4", "version_major": 2, "version_minor": 0 }, "text/plain": [ "interactive(children=(Dropdown(description='example', options=('1', '2'), value='1'), Dropdown(description='ty…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "interact(corr_examples, typ=[\"kowariancja\", \"korelacja\"], example=[\"1\", \"2\"])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- Własności kowariancji\n", " - $\\sigma_{X,Y} = \\sigma_{Y,X}$\n", " - $\\sigma_{X,k} = 0$\n", " - $\\sigma_{kX,Y} = k\\sigma_{X,Y}$\n", " - $\\sigma_{X,Y+Z} = \\sigma_{X,Y} + \\sigma_{X,Z}$ " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Współczynnik korelacji liniowej Pearsona" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- Korelacja zmiennych losowych\n", "\n", "$$\\rho=\\frac{cov(X,Y)}{\\sigma_X\\sigma_Y}$$\n", "\n", "- Estymator współczynnika korelacji\n", "\n", "$$r = \\frac{S_{XY}}{S_X S_Y} \\frac{\\sum_{i=1}^{n}(X_i-\\bar{X})(Y_i-\\bar{Y})}{\\sqrt{(\\sum_{i=1}^{n}(X_i-\\bar{X})^2)(\\sum_{i=1}^{n}(Y_i-\\bar{Y})^2)}}$$" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Przykład c.d.:\n", "\n", "$S_X$ = 1\n", "\n", "$S_Y$ = 2.646\n", "\n", "$r = $" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "3d31f59d6e564c98b0094efe71e5b1b9", "version_major": 2, "version_minor": 0 }, "text/plain": [ "interactive(children=(FloatSlider(value=1.0, description='slope', max=1.0, min=-1.0), IntSlider(value=0, descr…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "interact(corr_plot, slope=(-1, 1, 0.1), sd=(0,10,1), typ=[\"kowariancja\", \"korelacja\"])" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "dcddda7a357a46528830530bd4d239eb", "version_major": 2, "version_minor": 0 }, "text/plain": [ "interactive(children=(Dropdown(description='example', options=('1', '2'), value='1'), Dropdown(description='ty…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "interact(corr_examples, typ=[\"kowariancja\", \"korelacja\"], example=[\"1\", \"2\"])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Test na istotność współczynnika korelacji" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- Układ hipotez:\n", "\n", "$\\;\\;\\;\\;\\;\\;H_0: \\rho=0$ \n", "
$\\;\\;\\;\\;\\;\\;H_1: \\rho > / \\neq / < 0$\n", "\n", "- Statystyka:\n", "$$t=\\frac{r}{\\sqrt{1-r^2}}\\sqrt{n-2}~\\sim~t(n-2)$$" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Prosta regresja liniowa" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "e1685c0b8c7a476792ec338a9e114cac", "version_major": 2, "version_minor": 0 }, "text/plain": [ "interactive(children=(FloatSlider(value=1.0, description='slope', max=1.0, min=-1.0), IntSlider(value=0, descr…" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "interact(corr_plot, slope=(-1, 1, 0.1), sd=(0,10,1), typ=[\"kowariancja\", \"korelacja\"])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "$Y = \\hat{Y} + \\epsilon$\n", "\n", "$Y=\\beta_0+\\beta_1X+\\epsilon$" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "![regresja](http://www.cs.put.poznan.pl/amensfelt/pub/wline_names.png)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Założenia:\n", "\n", "- Zależność liniowa między X i Y\n", "- Wartości zmiennej niezależnej X są ustalone\n", "- $\\epsilon \\sim N(0,\\sigma^2)$" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Metoda najmniejszych kwadratów" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "$Y=b_0+b_1X+e$\n", "\n", "$\\hat{Y}=b_0+b_1X$\n", "\n", "$e_i=y_i-\\hat{y_i}$" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "![mnk](http://www.cs.put.poznan.pl/amensfelt/pub/wline_mnk.png)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- Suma kwadratów rezyduów\n", "\n", "$$S(b_0,b_1) = \\sum\\limits_{i=1}^n (y_i-\\hat{y_i})^2 = \\sum\\limits_{i=1}^n(y_i-(b_1x_i+b_0))^2$$" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- Wyraz wolny\n", "\n", "$$b_0= \\bar{y}-b_1\\bar{x}$$\n", "\n", "- Współczynnik kierunkowy\n", "\n", "$$b_1 =\\frac{s_{xy}}{s^2_x} = r\\frac{S_Y}{S_X}$$" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- prosta regresji przechodzi przez $(\\bar{x}, \\bar{y})$\n", "\n", "- znak($b_1$) = znak($r$)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Przykład c.d. 2:\n", "\n", "b1 = \n", "\n", "b0 = \n" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n" ], "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "x = np.array([1,2,3])\n", "y = np.array([5,6,1])\n", "plt.scatter(x, y)\n", "a = 1 #TODO\n", "b = 0 #TODO\n", "plt.plot(x, x*a+b)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Kwartet Anscombe’a" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "![kwartet](https://upload.wikimedia.org/wikipedia/commons/thumb/e/ec/Anscombe%27s_quartet_3.svg/1000px-Anscombe%27s_quartet_3.svg.png)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[Źródło](https://en.wikipedia.org/wiki/Anscombe%27s_quartet)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" }, "latex_envs": { "LaTeX_envs_menu_present": true, "autoclose": false, "autocomplete": true, "bibliofile": "biblio.bib", "cite_by": "apalike", "current_citInitial": 1, "eqLabelWithNumbers": true, "eqNumInitial": 1, "hotkeys": { "equation": "Ctrl-E", "itemize": "Ctrl-I" }, "labels_anchors": false, "latex_user_defs": false, "report_style_numbering": false, "user_envs_cfg": false } }, "nbformat": 4, "nbformat_minor": 4 }