(* Content-type: application/mathematica *)

(*** Wolfram Notebook File ***)
(* http://www.wolfram.com/nb *)

(* CreatedBy='Mathematica 6.0' *)

(* Maintenance notes

   Purpose: tutorial notebook that imports a two-column CSV file
   (TVs per capita, life expectancy), log-transforms the first column,
   computes the least-squares slope and intercept by hand, and plots the
   data with the fitted curve in both transformed and original units.

   Before evaluating: replace the XXX placeholder in the dataPath cell
   with the full path of TVData.csv, as described in the text cell
   that precedes it.

   This file was repaired by hand:
   - the quoted strings in the dataPath cell and in the three AxesLabel
     options had lost their contents and no longer parsed; dataPath is
     back to the XXX placeholder named in the text, and the axis-label
     wording is a reconstruction from the surrounding prose
     (NOTE(review): confirm the label text against the intended wording);
   - the front end's byte-offset cache (CacheID, NotebookDataPosition,
     NotebookFileOutline and friends) was removed because any external
     edit invalidates it; the front end regenerates it on the next save. *)

(* Beginning of Notebook Content *)
Notebook[{
Cell["\<\
Data Import and Linear Regression\
\>", "Title",
 CellChangeTimes->{3.455364078328125*^9}],

Cell["", "Subsubtitle"],

Cell["\<\
This notebook illustrates the use of imported data, in the form of \
lists/arrays, to perform least squares data fitting. In this case, we will \
create graphs to prove that you should buy TVs if you want to live longer.\
\>", "Subsubtitle"],

Cell[TextData[{
 "Use the ",
 StyleBox["Import",
  FontWeight->"Bold"],
 " function to import data from a text file -- ",
 StyleBox["Mathematica",
  FontSlant->"Italic"],
 " can read CSV (comma-delimited text) files easily. Fill in the name of the \
file in the place of XXX. The easiest way to do this is to select \"File \
Path\" from the \"Insert\" menu; find the file TVData.csv and click \"open\"; \
the full file pathname will be entered where the cursor is."
}], "Text",
 CellChangeTimes->{{3.45452093653125*^9, 3.454520942453125*^9}}],

Cell[BoxData[
 RowBox[{
  RowBox[{"dataPath", "=", "\"\<XXX\>\""}], ";"}]], "Input",
 CellChangeTimes->{
  3.454520799515625*^9, {3.4545208880625*^9, 3.454520907703125*^9}}],

Cell[BoxData[
 RowBox[{"tvdata", " ", "=", " ",
  RowBox[{"Import", "[", "dataPath", "]"}]}]], "Input"],

Cell[TextData[{
 "The data should now be saved as a 2D array called ",
 StyleBox["tvdata",
  FontWeight->"Bold"],
 ". From here you can evaluate the entire notebook to see some data \
manipulation and fitting in action."
}], "Text"],

Cell[TextData[{
 "With the data saved in ",
 StyleBox["tvdata",
  FontWeight->"Bold"],
 " as a list, we can plot it using ",
 StyleBox["ListPlot",
  FontWeight->"Bold"],
 "."
}], "Text"],

Cell[BoxData[
 RowBox[{"datp1", "=",
  RowBox[{"ListPlot", "[", "tvdata", "]"}]}]], "Input"],

Cell["\<\
Looks like there's some functional relationship here (with plenty of \
real-life noise as well). Let's extract the data values. The first element \
of each data pair (the 'x' value) is the number of TVs per capita (for \
different countries); the second element ('y') is the average life expectancy \
for citizens of that country.

We can think of a list as a matrix, with each data pair being a row of the \
matrix:\
\>", "Text"],

Cell[BoxData[
 RowBox[{"tvdata", "//", "MatrixForm"}]], "Input"],

Cell[TextData[{
 "So, to extract just the 'x'-values (the TVs per capita), we need the first \
column of this matrix",
 ". In ",
 StyleBox["Mathematica",
  FontSlant->"Italic"],
 ", use double square brackets to access array elements; use ",
 StyleBox["All",
  FontWeight->"Bold"],
 " to specify an entire row or column:"
}], "Text"],

Cell[BoxData[
 RowBox[{"tvs", "=",
  RowBox[{"tvdata", "[",
   RowBox[{"[",
    RowBox[{"All", ",", "1"}], "]"}], "]"}]}]], "Input"],

Cell[BoxData[
 RowBox[{"life", "=",
  RowBox[{"tvdata", "[",
   RowBox[{"[",
    RowBox[{"All", ",", "2"}], "]"}], "]"}]}]], "Input"],

Cell[TextData[{
 "Now we have two new variables ",
 StyleBox["tvs",
  FontWeight->"Bold"],
 " and ",
 StyleBox["life",
  FontWeight->"Bold"],
 " that hold the two data series we plotted against each other above. From \
the graph, it looks like the data follows a logarithmic kind of relationship, \
so let's define a new x variable that is the log of ",
 StyleBox["tvs",
  FontWeight->"Bold"],
 ". It doesn't matter which base of log we use, so let's use the natural log \
(which in ",
 StyleBox["Mathematica",
  FontSlant->"Italic"],
 " is ",
 StyleBox["Log",
  FontWeight->"Bold"],
 "). We'll need to construct a list like our original data in order to plot, \
so let's first make a copy of the original data, then overwrite the first \
column with our transformed data:"
}], "Text"],

Cell[BoxData[
 RowBox[{
  RowBox[{"logdata", "=", "tvdata"}], ";"}]], "Input"],

Cell[BoxData[
 RowBox[{"logtv", " ", "=", " ",
  RowBox[{"Log", "[", "tvs", "]"}]}]], "Input"],

Cell[BoxData[
 RowBox[{
  RowBox[{"logdata", "[",
   RowBox[{"[",
    RowBox[{"All", ",", "1"}], "]"}], "]"}], "=", "logtv"}]], "Input"],

Cell[TextData[{
 "Note how we used the double bracket notation again to specify a particular \
part of the array ",
 StyleBox["logdata",
  FontWeight->"Bold"],
 ". Now let's check how the whole array looks, then plot it:"
}], "Text"],

Cell[BoxData["logdata"], "Input"],

Cell[BoxData[
 RowBox[{"datp2", "=",
  RowBox[{"ListPlot", "[", "logdata", "]"}]}]], "Input"],

Cell["\<\
Hey hey! That looks vaguely linear! We could try fitting a line to it... \
The textbook shows how to get equations for the slope and intercept of the \
best-fit line. There are various different formulations but they all require \
sums of various powers of the x and y data points.\
\>", "Text"],

Cell[BoxData[
 RowBox[{"n", "=",
  RowBox[{"Length", "[", "logtv", "]"}]}]], "Input"],

Cell["Calculate the mean (average) x value", "Text"],

Cell[BoxData[
 RowBox[{"meanx", "=",
  RowBox[{
   RowBox[{"Total", "[", "logtv", "]"}], "/", "n"}]}]], "Input"],

Cell["Calculate the mean y value", "Text"],

Cell[BoxData[
 RowBox[{"meany", "=",
  RowBox[{
   RowBox[{"Total", "[", "life", "]"}], "/", "n"}]}]], "Input"],

Cell["\<\
We use the following construction to calculate the sums of squared \
deviations and cross-products (the un-normalized variance and covariance) \
needed in the least-squares formula for the line L(x) = intercept + slope * x.\
\>", "Text"],

Cell[BoxData[
 RowBox[{"sxx", " ", "=", " ",
  RowBox[{"Total", "[",
   RowBox[{
    RowBox[{"(",
     RowBox[{"logtv", "-", "meanx"}], ")"}], "^", "2"}], "]"}]}]], "Input"],

Cell[BoxData[
 RowBox[{"sxy", "=",
  RowBox[{"Total", "[",
   RowBox[{
    RowBox[{"(",
     RowBox[{"logtv", "-", "meanx"}], ")"}], "*",
    RowBox[{"(",
     RowBox[{"life", "-", "meany"}], ")"}]}], "]"}]}]], "Input"],

Cell["Note that we can interpret these sums as dot products:", "Text"],

Cell[BoxData[
 RowBox[{"sxx", " ", "=",
  RowBox[{"Dot", "[",
   RowBox[{
    RowBox[{"(",
     RowBox[{"logtv", "-", "meanx"}], ")"}], ",",
    RowBox[{"(",
     RowBox[{"logtv", "-", "meanx"}], ")"}]}], "]"}]}]], "Input"],

Cell[BoxData[
 RowBox[{"sxy", " ", "=",
  RowBox[{"Dot", "[",
   RowBox[{
    RowBox[{"(",
     RowBox[{"logtv", "-", "meanx"}], ")"}], ",",
    RowBox[{"(",
     RowBox[{"life", "-", "meany"}], ")"}]}], "]"}]}]], "Input"],

Cell["\<\
Either way, now we apply the formulae to get the coefficients of the best-fit \
line:\
\>", "Text"],

Cell["Calculate the slope", "Text"],

Cell[BoxData[
 RowBox[{"slp", "=",
  RowBox[{"sxy", "/", "sxx"}]}]], "Input"],

Cell["Calculate the intercept", "Text"],

Cell[BoxData[
 RowBox[{"int", "=",
  RowBox[{"meany", "-",
   RowBox[{"slp", "*", "meanx"}]}]}]], "Input"],

Cell["\<\
So... apparently le(x) = 80.59 + 5.79 log(x) is the least squares solution \
where le is life expectancy in years, x is the number of TVs per capita, and \
log is the natural log. \
So, let's plot the data and the best-fit curve. First in the transformed \
data, then in original units:\
\>", "Text"],

Cell["Define the regression function", "Text"],

Cell[BoxData[
 RowBox[{
  RowBox[{"le", "[", "x_", "]"}], " ", "=", " ",
  RowBox[{"int", "+", " ",
   RowBox[{"slp", "*",
    RowBox[{"Log", "[", "x", "]"}]}]}]}]], "Input"],

Cell[BoxData[
 RowBox[{"fitp1", "=",
  RowBox[{"Plot", "[",
   RowBox[{
    RowBox[{"le", "[", "x", "]"}], ",",
    RowBox[{"{",
     RowBox[{"x", ",", "0", ",", "0.8"}], "}"}]}], "]"}]}]], "Input"],

Cell[BoxData[
 RowBox[{"fitp2", "=",
  RowBox[{"Plot", "[",
   RowBox[{
    RowBox[{"int", "+",
     RowBox[{"slp", "*", "x"}]}], ",",
    RowBox[{"{",
     RowBox[{"x", ",",
      RowBox[{"-", "5.5"}], ",", "0"}], "}"}]}], "]"}]}]], "Input"],

Cell[BoxData[
 RowBox[{"Show", "[",
  RowBox[{"datp2", ",", "fitp2", ",",
   RowBox[{"AxesLabel", "\[Rule]",
    RowBox[{"{",
     RowBox[{"\"\<log(TVs per capita)\>\"", ",",
      "\"\<Life expectancy (years)\>\""}], "}"}]}], ",",
   RowBox[{"ImageSize", "\[Rule]", "400"}]}], "]"}]], "Input"],

Cell[BoxData[
 RowBox[{"Show", "[",
  RowBox[{"datp1", ",", "fitp1", ",",
   RowBox[{"AxesLabel", "\[Rule]",
    RowBox[{"{",
     RowBox[{"\"\<TVs per capita\>\"", ",",
      "\"\<Life expectancy (years)\>\""}], "}"}]}], ",",
   RowBox[{"ImageSize", "\[Rule]", "400"}]}], "]"}]], "Input"],

Cell[TextData[{
 "Alternatively, maybe the number of TVs you buy depends on your life \
expectancy...? It can be helpful to switch the order of the variables. This \
requires interchanging columns, which you can do with the ",
 StyleBox["Reverse",
  FontWeight->"Bold"],
 " function:"
}], "Text"],

Cell[BoxData[
 RowBox[{"ledata", "=",
  RowBox[{"Reverse", "/@", "tvdata"}]}]], "Input"],

Cell[BoxData[
 RowBox[{"ListPlot", "[",
  RowBox[{"ledata", ",",
   RowBox[{"AxesLabel", "\[Rule]",
    RowBox[{"{",
     RowBox[{"\"\<Life expectancy (years)\>\"", ",",
      "\"\<TVs per capita\>\""}], "}"}]}], ",",
   RowBox[{"ImageSize", "\[Rule]", "400"}]}], "]"}]], "Input"],

Cell["\<\
And we could proceed with a similar analysis of the reversed data.\
\>", "Text"]
},
WindowSize->{848, 720},
WindowMargins->{{Automatic, 183}, {Automatic, 5}},
ShowSelection->True,
FrontEndVersion->"7.0 for Microsoft Windows (32-bit) (February 18, 2009)",
StyleDefinitions->"DemoText.nb"
]
(* End of Notebook Content *)