632 lines
18 KiB
Plaintext
632 lines
18 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"id": "bd4b5db9-6439-4519-aef0-c8bb8ffd6a13",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import pandas as pd"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "a3c4bf54-07f7-46bb-9ae5-77c3a4518627",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Basics"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"id": "def1dfe3-6afe-4f5e-9714-36b65811094a",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
" cars passings\n",
|
|
"0 BMW 3\n",
|
|
"1 Volvo 7\n",
|
|
"2 Ford 2\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"mydataset = {\n",
|
|
" 'cars': [\"BMW\", \"Volvo\", \"Ford\"],\n",
|
|
" 'passings': [3, 7, 2]\n",
|
|
"}\n",
|
|
"\n",
|
|
"myvar = pd.DataFrame(mydataset)\n",
|
|
"\n",
|
|
"print(myvar)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"id": "0785d9bd-e5b1-4221-84e9-1a1a6a2cf37e",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"0 1\n",
|
|
"1 7\n",
|
|
"2 2\n",
|
|
"dtype: int64\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Series: a one-dimensional labelled array, similar to a single column of a table\n",
|
|
"a = [1, 7, 2]\n",
|
|
"\n",
|
|
"myvar = pd.Series(a)\n",
|
|
"\n",
|
|
"print(myvar)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"id": "2d7a5f33-79a2-4e2a-9b14-eee43db90662",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"x 1\n",
|
|
"y 7\n",
|
|
"z 2\n",
|
|
"dtype: int64\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Series with custom index labels (x, y, z) in place of the default 0, 1, 2\n",
|
|
"a = [1, 7, 2]\n",
|
|
"\n",
|
|
"myvar = pd.Series(a, index = [\"x\", \"y\", \"z\"])\n",
|
|
"\n",
|
|
"print(myvar)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"id": "cba4f2ef-45d1-4c23-94ce-7197c43d4aee",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"day1 420\n",
|
|
"day2 380\n",
|
|
"day3 390\n",
|
|
"dtype: int64\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Series from a dict (key/value object): the keys become the index labels\n",
|
|
"calories = {\"day1\": 420, \"day2\": 380, \"day3\": 390}\n",
|
|
"\n",
|
|
"myvar = pd.Series(calories)\n",
|
|
"\n",
|
|
"print(myvar)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 13,
|
|
"id": "e965971f-6a97-42c4-81bf-2550e57cd7e9",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
" calories duration\n",
|
|
"0 420 50\n",
|
|
"1 380 40\n",
|
|
"2 390 45\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# DataFrame: a two-dimensional table with labelled rows and columns\n",
|
|
"data = {\n",
|
|
" \"calories\": [420, 380, 390],\n",
|
|
" \"duration\": [50, 40, 45]\n",
|
|
"}\n",
|
|
"\n",
|
|
"df = pd.DataFrame(data)\n",
|
|
"\n",
|
|
"print(df)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 14,
|
|
"id": "a6650272-40db-4237-b2d6-83d3652184c2",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"calories 420\n",
|
|
"duration 50\n",
|
|
"Name: 0, dtype: int64\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Locate one row by its index label with loc; the result is a Series\n",
|
|
"print(df.loc[0])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 15,
|
|
"id": "694a5a9b-c280-454c-b7c6-236eda33d8f4",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
" calories duration\n",
|
|
"0 420 50\n",
|
|
"1 380 40\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Use a list of index labels to locate several rows; the result is a DataFrame\n",
|
|
"print(df.loc[[0, 1]])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 16,
|
|
"id": "e3ff6022-327e-446d-a739-011730d1db8a",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
" calories duration\n",
|
|
"day1 420 50\n",
|
|
"day2 380 40\n",
|
|
"day3 390 45\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Named index: give each row a label instead of the default integer index\n",
|
|
"data = {\n",
|
|
" \"calories\": [420, 380, 390],\n",
|
|
" \"duration\": [50, 40, 45]\n",
|
|
"}\n",
|
|
"\n",
|
|
"df = pd.DataFrame(data, index = [\"day1\", \"day2\", \"day3\"])\n",
|
|
"\n",
|
|
"print(df) "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 17,
|
|
"id": "4d8a50a4-7c1d-4ef7-ab0b-09cbc8085ad6",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"calories 380\n",
|
|
"duration 40\n",
|
|
"Name: day2, dtype: int64\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Refer to a row by its named index label\n",
|
|
"print(df.loc[\"day2\"])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "b18cee34-c5dd-43df-9e2e-bd20ebeed8ed",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Loading Files"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 19,
|
|
"id": "abef3aeb-d82c-4cc5-a2d2-7a70313f0712",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
" Duration Pulse Maxpulse Calories\n",
|
|
"0 60 110 130 409.1\n",
|
|
"1 60 117 145 479.0\n",
|
|
"2 60 103 135 340.0\n",
|
|
"3 45 109 175 282.4\n",
|
|
"4 45 117 148 406.0\n",
|
|
"5 60 102 127 300.0\n",
|
|
"6 60 110 136 374.0\n",
|
|
"7 45 104 134 253.3\n",
|
|
"8 30 109 133 195.1\n",
|
|
"9 60 98 124 269.0\n",
|
|
"10 60 103 147 329.3\n",
|
|
"11 60 100 120 250.7\n",
|
|
"12 60 106 128 345.3\n",
|
|
"13 60 104 132 379.3\n",
|
|
"14 60 98 123 275.0\n",
|
|
"15 60 98 120 215.2\n",
|
|
"16 60 100 120 300.0\n",
|
|
"17 45 90 112 NaN\n",
|
|
"18 60 103 123 323.0\n",
|
|
"19 45 97 125 243.0\n",
|
|
"20 60 108 131 364.2\n",
|
|
"21 45 100 119 282.0\n",
|
|
"22 60 130 101 300.0\n",
|
|
"23 45 105 132 246.0\n",
|
|
"24 60 102 126 334.5\n",
|
|
"25 60 100 120 250.0\n",
|
|
"26 60 92 118 241.0\n",
|
|
"27 60 103 132 NaN\n",
|
|
"28 60 100 132 280.0\n",
|
|
"29 60 102 129 380.3\n",
|
|
"30 60 92 115 243.0\n",
|
|
"31 45 90 112 180.1\n",
|
|
"32 60 101 124 299.0\n",
|
|
"33 60 93 113 223.0\n",
|
|
"34 60 107 136 361.0\n",
|
|
"35 60 114 140 415.0\n",
|
|
"36 60 102 127 300.0\n",
|
|
"37 60 100 120 300.0\n",
|
|
"38 60 100 120 300.0\n",
|
|
"39 45 104 129 266.0\n",
|
|
"40 45 90 112 180.1\n",
|
|
"41 60 98 126 286.0\n",
|
|
"42 60 100 122 329.4\n",
|
|
"43 60 111 138 400.0\n",
|
|
"44 60 111 131 397.0\n",
|
|
"45 60 99 119 273.0\n",
|
|
"46 60 109 153 387.6\n",
|
|
"47 45 111 136 300.0\n",
|
|
"48 45 108 129 298.0\n",
|
|
"49 60 111 139 397.6\n",
|
|
"50 60 107 136 380.2\n",
|
|
"51 80 123 146 643.1\n",
|
|
"52 60 106 130 263.0\n",
|
|
"53 60 118 151 486.0\n",
|
|
"54 30 136 175 238.0\n",
|
|
"55 60 121 146 450.7\n",
|
|
"56 60 118 121 413.0\n",
|
|
"57 45 115 144 305.0\n",
|
|
"58 20 153 172 226.4\n",
|
|
"59 45 123 152 321.0\n",
|
|
"60 210 108 160 1376.0\n",
|
|
"61 160 110 137 1034.4\n",
|
|
"62 160 109 135 853.0\n",
|
|
"63 45 118 141 341.0\n",
|
|
"64 20 110 130 131.4\n",
|
|
"65 180 90 130 800.4\n",
|
|
"66 150 105 135 873.4\n",
|
|
"67 150 107 130 816.0\n",
|
|
"68 20 106 136 110.4\n",
|
|
"69 300 108 143 1500.2\n",
|
|
"70 150 97 129 1115.0\n",
|
|
"71 60 109 153 387.6\n",
|
|
"72 90 100 127 700.0\n",
|
|
"73 150 97 127 953.2\n",
|
|
"74 45 114 146 304.0\n",
|
|
"75 90 98 125 563.2\n",
|
|
"76 45 105 134 251.0\n",
|
|
"77 45 110 141 300.0\n",
|
|
"78 120 100 130 500.4\n",
|
|
"79 270 100 131 1729.0\n",
|
|
"80 30 159 182 319.2\n",
|
|
"81 45 149 169 344.0\n",
|
|
"82 30 103 139 151.1\n",
|
|
"83 120 100 130 500.0\n",
|
|
"84 45 100 120 225.3\n",
|
|
"85 30 151 170 300.0\n",
|
|
"86 45 102 136 234.0\n",
|
|
"87 120 100 157 1000.1\n",
|
|
"88 45 129 103 242.0\n",
|
|
"89 20 83 107 50.3\n",
|
|
"90 180 101 127 600.1\n",
|
|
"91 45 107 137 NaN\n",
|
|
"92 30 90 107 105.3\n",
|
|
"93 15 80 100 50.5\n",
|
|
"94 20 150 171 127.4\n",
|
|
"95 20 151 168 229.4\n",
|
|
"96 30 95 128 128.2\n",
|
|
"97 25 152 168 244.2\n",
|
|
"98 30 109 131 188.2\n",
|
|
"99 90 93 124 604.1\n",
|
|
"100 20 95 112 77.7\n",
|
|
"101 90 90 110 500.0\n",
|
|
"102 90 90 100 500.0\n",
|
|
"103 90 90 100 500.4\n",
|
|
"104 30 92 108 92.7\n",
|
|
"105 30 93 128 124.0\n",
|
|
"106 180 90 120 800.3\n",
|
|
"107 30 90 120 86.2\n",
|
|
"108 90 90 120 500.3\n",
|
|
"109 210 137 184 1860.4\n",
|
|
"110 60 102 124 325.2\n",
|
|
"111 45 107 124 275.0\n",
|
|
"112 15 124 139 124.2\n",
|
|
"113 45 100 120 225.3\n",
|
|
"114 60 108 131 367.6\n",
|
|
"115 60 108 151 351.7\n",
|
|
"116 60 116 141 443.0\n",
|
|
"117 60 97 122 277.4\n",
|
|
"118 60 105 125 NaN\n",
|
|
"119 60 103 124 332.7\n",
|
|
"120 30 112 137 193.9\n",
|
|
"121 45 100 120 100.7\n",
|
|
"122 60 119 169 336.7\n",
|
|
"123 60 107 127 344.9\n",
|
|
"124 60 111 151 368.5\n",
|
|
"125 60 98 122 271.0\n",
|
|
"126 60 97 124 275.3\n",
|
|
"127 60 109 127 382.0\n",
|
|
"128 90 99 125 466.4\n",
|
|
"129 60 114 151 384.0\n",
|
|
"130 60 104 134 342.5\n",
|
|
"131 60 107 138 357.5\n",
|
|
"132 60 103 133 335.0\n",
|
|
"133 60 106 132 327.5\n",
|
|
"134 60 103 136 339.0\n",
|
|
"135 20 136 156 189.0\n",
|
|
"136 45 117 143 317.7\n",
|
|
"137 45 115 137 318.0\n",
|
|
"138 45 113 138 308.0\n",
|
|
"139 20 141 162 222.4\n",
|
|
"140 60 108 135 390.0\n",
|
|
"141 60 97 127 NaN\n",
|
|
"142 45 100 120 250.4\n",
|
|
"143 45 122 149 335.4\n",
|
|
"144 60 136 170 470.2\n",
|
|
"145 45 106 126 270.8\n",
|
|
"146 60 107 136 400.0\n",
|
|
"147 60 112 146 361.9\n",
|
|
"148 30 103 127 185.0\n",
|
|
"149 60 110 150 409.4\n",
|
|
"150 60 106 134 343.0\n",
|
|
"151 60 109 129 353.2\n",
|
|
"152 60 109 138 374.0\n",
|
|
"153 30 150 167 275.8\n",
|
|
"154 60 105 128 328.0\n",
|
|
"155 60 111 151 368.5\n",
|
|
"156 60 97 131 270.4\n",
|
|
"157 60 100 120 270.4\n",
|
|
"158 60 114 150 382.8\n",
|
|
"159 30 80 120 240.9\n",
|
|
"160 30 85 120 250.4\n",
|
|
"161 45 90 130 260.4\n",
|
|
"162 45 95 130 270.0\n",
|
|
"163 45 100 140 280.9\n",
|
|
"164 60 105 140 290.8\n",
|
|
"165 60 110 145 300.0\n",
|
|
"166 60 115 145 310.2\n",
|
|
"167 75 120 150 320.4\n",
|
|
"168 75 125 150 330.4\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Data source: https://www.w3schools.com/python/pandas/data.csv (read over the network)\n",
|
|
"df = pd.read_csv('https://www.w3schools.com/python/pandas/data.csv')\n",
|
|
"\n",
|
|
"print(df.to_string()) "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "1dba1828-de88-4a61-868b-5a8b638254f0",
|
|
"metadata": {},
|
|
"source": [
|
|
"#### Analyze Data"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 22,
|
|
"id": "e89c6f79-b957-4c6b-85bc-c76b362d7a2a",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"(169, 4)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"print(df.shape)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 25,
|
|
"id": "460724ee-1ba6-461c-801e-46632a68c837",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"<class 'pandas.core.frame.DataFrame'>\n",
|
|
"RangeIndex: 169 entries, 0 to 168\n",
|
|
"Data columns (total 4 columns):\n",
|
|
" # Column Non-Null Count Dtype \n",
|
|
"--- ------ -------------- ----- \n",
|
|
" 0 Duration 169 non-null int64 \n",
|
|
" 1 Pulse 169 non-null int64 \n",
|
|
" 2 Maxpulse 169 non-null int64 \n",
|
|
" 3 Calories 164 non-null float64\n",
|
|
"dtypes: float64(1), int64(3)\n",
|
|
"memory usage: 5.4 KB\n",
|
|
"None\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"print(df.info()) "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 21,
|
|
"id": "418bbc3e-53ff-4ea2-9728-1b8aa94a140d",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
" Duration Pulse Maxpulse Calories\n",
|
|
"0 60 110 130 409.1\n",
|
|
"1 60 117 145 479.0\n",
|
|
"2 60 103 135 340.0\n",
|
|
"3 45 109 175 282.4\n",
|
|
"4 45 117 148 406.0\n",
|
|
"5 60 102 127 300.0\n",
|
|
"6 60 110 136 374.0\n",
|
|
"7 45 104 134 253.3\n",
|
|
"8 30 109 133 195.1\n",
|
|
"9 60 98 124 269.0\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"print(df.head(10))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 23,
|
|
"id": "b7424301-9bb4-4200-b2d5-bcc8a22c6427",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
" Duration Pulse Maxpulse Calories\n",
|
|
"0 60 110 130 409.1\n",
|
|
"1 60 117 145 479.0\n",
|
|
"2 60 103 135 340.0\n",
|
|
"3 45 109 175 282.4\n",
|
|
"4 45 117 148 406.0\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"print(df.head())"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 24,
|
|
"id": "a7649f71-6c14-4158-8040-08019577b1ad",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
" Duration Pulse Maxpulse Calories\n",
|
|
"164 60 105 140 290.8\n",
|
|
"165 60 110 145 300.0\n",
|
|
"166 60 115 145 310.2\n",
|
|
"167 75 120 150 320.4\n",
|
|
"168 75 125 150 330.4\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"print(df.tail())"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 29,
|
|
"id": "8da3696c-6a3c-4727-8cea-6442d7cedf28",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"count 169.000000\n",
|
|
"mean 107.461538\n",
|
|
"std 14.510259\n",
|
|
"min 80.000000\n",
|
|
"25% 100.000000\n",
|
|
"50% 105.000000\n",
|
|
"75% 111.000000\n",
|
|
"max 159.000000\n",
|
|
"Name: Pulse, dtype: float64\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"print(df['Pulse'].describe())"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "8689d23c-5d68-4fdb-bcef-9ae080442a27",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.10.8"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|