{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Logistic regression" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Logistic regression can be used to learn the probability of an event being true or false as a function of one or more features.\n", "\n", "Here we present, as an example, a simple 'linear' logistic regression." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Imports" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We import the following packages, classes, and functions." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# for handling data:\n", "import numpy as np\n", "import pandas as pd\n", "\n", "# for plotting:\n", "import matplotlib.pyplot as plt\n", "\n", "import halerium.core as hal\n", "\n", "# for graphs:\n", "from halerium.core import Graph, Entity, Variable, StaticVariable\n", "from halerium.core.regression import linear_regression, polynomial_regression, connect_via_regression\n", "from halerium.core.distribution import BernoulliDistribution\n", "\n", "# for models:\n", "from halerium.core import DataLinker, get_data_linker\n", "from halerium.core.model import MAPModel, ForwardModel, Trainer\n", "from halerium.core.model import get_posterior_model\n", "\n", "# for predictions:\n", "from halerium import Predictor\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Example data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To create example data, we simply build a forward model of logistic regression:" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "n_data = 100\n", "\n", "x_scatter = 10\n", "\n", "with Graph(\"graph\") as graph:\n", "\n", " x = Variable(\"x\", shape=(2,), mean=0, variance=x_scatter**2)\n", " y = Variable(\"y\", shape=(), distribution=BernoulliDistribution)\n", "\n", " connect_via_regression(\n", " name_prefix=\"parameters\",\n", " inputs=x,\n", 
" outputs=y,\n", " order=1,\n", " )\n", "\n", "slope = graph.parameters_y.location.slope\n", "intercept = graph.parameters_y.location.intercept\n", "\n", "model = ForwardModel(graph, data=DataLinker(n_data))\n", "x_data, y_data, slope_data, intercept_data = model.get_example((x, y, slope, intercept))\n", "\n", "data = pd.DataFrame()\n", "data[\"x_1\"] = x_data[:,0]\n", "data[\"x_2\"] = x_data[:,1]\n", "data[\"y\"] = y_data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's visualize the generated data:" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "x_true = x_data[y_data]\n", "x_false = x_data[~y_data]\n", "plt.plot(x_true[:,0], x_true[:,1], '.b')\n", "plt.plot(x_false[:,0], x_false[:,1], '+r')\n", "\n", "r = np.linspace((-1,-1), (1, 1)) * 3 * x_scatter / np.linalg.norm(slope_data)\n", "x_r = r * (np.array([[0,1],[-1,0]]) @ slope_data) - intercept_data / np.linalg.norm(slope_data)**2 * slope_data\n", "plt.plot(x_r[:,0], x_r[:,1], '--k');\n", "plt.xlabel(\"$x_1$\");\n", "plt.ylabel(\"$x_2$\");\n", "plt.legend([\"$y$=true\", \"$y$=false\", \"$p_{true}=1/2$\"]);" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Logistic regression model" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now let us bulid an train a logistic regression model." ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "with Graph(\"graph\") as graph:\n", " \n", " x = Variable(\"x\", shape=(2,), mean=0, variance=x_scatter**2)\n", " y = Variable(\"y\", shape=(), distribution=BernoulliDistribution)\n", "\n", " connect_via_regression(\n", " name_prefix=\"parameters\",\n", " inputs=x,\n", " outputs=y,\n", " order=1,\n", " ) \n", " \n", "trained_graph = Trainer(graph=graph, data = {graph.x: data[[\"x_1\", \"x_2\"]], graph.y: data[\"y\"]})()\n", "\n", "trained_graph.parameters_y.location.slope.mean" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can use the trained graph to predict the probability of y=true for a given set of x-values:" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
x_1x_2p_y_pred
0-10.00.00.913
1-8.00.00.854
2-6.00.00.812
3-4.00.00.718
4-2.00.00.619
50.00.00.505
62.00.00.396
74.00.00.293
86.00.00.198
98.00.00.150
1010.00.00.094
\n", "
" ], "text/plain": [ " x_1 x_2 p_y_pred\n", "0 -10.0 0.0 0.913\n", "1 -8.0 0.0 0.854\n", "2 -6.0 0.0 0.812\n", "3 -4.0 0.0 0.718\n", "4 -2.0 0.0 0.619\n", "5 0.0 0.0 0.505\n", "6 2.0 0.0 0.396\n", "7 4.0 0.0 0.293\n", "8 6.0 0.0 0.198\n", "9 8.0 0.0 0.150\n", "10 10.0 0.0 0.094" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "prediction_x_data = np.linspace((-10, 0), (10, 0), 11)\n", "\n", "predictor = Predictor(graph=trained_graph, data={trained_graph.x: prediction_x_data}, n_samples=1000)\n", "prediction_y_data = predictor(trained_graph.y)\n", "\n", "prediction_data = pd.DataFrame()\n", "prediction_data[\"x_1\"] = prediction_x_data[:,0]\n", "prediction_data[\"x_2\"] = prediction_x_data[:,1]\n", "prediction_data[\"p_y_pred\"] = prediction_y_data\n", "display(prediction_data)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can also compare the predicted vs. true $p_{true}=1/2$-line:" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "x_true = data[data[\"y\"]][[\"x_1\", \"x_2\"]]\n", "x_false = data[~data[\"y\"]][[\"x_1\", \"x_2\"]]\n", "\n", "plt.plot(x_true[\"x_1\"], x_true[\"x_2\"], '.b')\n", "plt.plot(x_false[\"x_1\"], x_false[\"x_2\"], '+r')\n", "\n", "r = np.linspace((-1,-1), (1, 1)) * 3 * x_scatter / np.linalg.norm(slope_data)\n", "x_r = r * (np.array([[0,1],[-1,0]]) @ slope_data) - intercept_data / np.linalg.norm(slope_data)**2 * slope_data\n", "plt.plot(x_r[:,0], x_r[:,1], '--k');\n", "\n", "inferred_slope_data = predictor(trained_graph.parameters_y.location.slope)\n", "inferred_intercept_data = predictor(trained_graph.parameters_y.location.intercept)\n", "\n", "s = np.linspace((-1,-1), (1, 1)) * 3 * x_scatter / np.linalg.norm(inferred_slope_data)\n", "x_s = s * (np.array([[0,1],[-1,0]]) @ inferred_slope_data) - inferred_intercept_data / np.linalg.norm(inferred_slope_data)**2 * inferred_slope_data\n", "plt.plot(x_s[:,0], x_s[:,1], '--g');\n", "\n", "\n", "\n", "plt.xlabel(\"$x_1$\");\n", "plt.ylabel(\"$x_2$\");\n", "plt.legend([\"$y$=true\", \"$y$=false\", \"$p_{true}=1/2$\", \"inferred $p_{true}=1/2$\"]);" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.7" } }, "nbformat": 4, "nbformat_minor": 4 }