Technologische_Grundlagen/course/pandas/01_basics.ipynb

746 lines
22 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "bd4b5db9-6439-4519-aef0-c8bb8ffd6a13",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "markdown",
"id": "a3c4bf54-07f7-46bb-9ae5-77c3a4518627",
"metadata": {},
"source": [
"### Basics"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "def1dfe3-6afe-4f5e-9714-36b65811094a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" cars passings\n",
"0 BMW 3\n",
"1 Volvo 7\n",
"2 Ford 2\n"
]
}
],
"source": [
"mydataset = {\n",
" 'cars': [\"BMW\", \"Volvo\", \"Ford\"],\n",
" 'passings': [3, 7, 2]\n",
"}\n",
"\n",
"myvar = pd.DataFrame(mydataset)\n",
"\n",
"print(myvar)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "0785d9bd-e5b1-4221-84e9-1a1a6a2cf37e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 1\n",
"1 7\n",
"2 2\n",
"dtype: int64\n"
]
}
],
"source": [
"# series: numpy arrays!\n",
"a = [1, 7, 2]\n",
"\n",
"myvar = pd.Series(a)\n",
"\n",
"print(myvar)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "2d7a5f33-79a2-4e2a-9b14-eee43db90662",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"x 1\n",
"y 7\n",
"z 2\n",
"dtype: int64\n"
]
}
],
"source": [
"# labels\n",
"a = [1, 7, 2]\n",
"\n",
"myvar = pd.Series(a, index = [\"x\", \"y\", \"z\"])\n",
"\n",
"print(myvar)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "cba4f2ef-45d1-4c23-94ce-7197c43d4aee",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"day1 420\n",
"day2 380\n",
"day3 390\n",
"dtype: int64\n"
]
}
],
"source": [
"# key value objects\n",
"calories = {\"day1\": 420, \"day2\": 380, \"day3\": 390}\n",
"\n",
"myvar = pd.Series(calories)\n",
"\n",
"print(myvar)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "e965971f-6a97-42c4-81bf-2550e57cd7e9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" calories duration\n",
"0 420 50\n",
"1 380 40\n",
"2 390 45\n"
]
}
],
"source": [
"# dataframe = multi-dimensional tables\n",
"data = {\n",
" \"calories\": [420, 380, 390],\n",
" \"duration\": [50, 40, 45]\n",
"}\n",
"\n",
"df = pd.DataFrame(data)\n",
"\n",
"print(df)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "a6650272-40db-4237-b2d6-83d3652184c2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"calories 420\n",
"duration 50\n",
"Name: 0, dtype: int64\n"
]
}
],
"source": [
"# locate row\n",
"print(df.loc[0])"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "694a5a9b-c280-454c-b7c6-236eda33d8f4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" calories duration\n",
"0 420 50\n",
"1 380 40\n"
]
}
],
"source": [
"#use a list of indexes:\n",
"print(df.loc[[0, 1]])"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "e3ff6022-327e-446d-a739-011730d1db8a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" calories duration\n",
"day1 420 50\n",
"day2 380 40\n",
"day3 390 45\n"
]
}
],
"source": [
"# named index\n",
"data = {\n",
" \"calories\": [420, 380, 390],\n",
" \"duration\": [50, 40, 45]\n",
"}\n",
"\n",
"df = pd.DataFrame(data, index = [\"day1\", \"day2\", \"day3\"])\n",
"\n",
"print(df) "
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "4d8a50a4-7c1d-4ef7-ab0b-09cbc8085ad6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"calories 380\n",
"duration 40\n",
"Name: day2, dtype: int64\n"
]
}
],
"source": [
"#refer to the named index:\n",
"print(df.loc[\"day2\"])"
]
},
{
"cell_type": "markdown",
"id": "b18cee34-c5dd-43df-9e2e-bd20ebeed8ed",
"metadata": {},
"source": [
"### Loading Files"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "abef3aeb-d82c-4cc5-a2d2-7a70313f0712",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Duration Pulse Maxpulse Calories\n",
"0 60 110 130 409.1\n",
"1 60 117 145 479.0\n",
"2 60 103 135 340.0\n",
"3 45 109 175 282.4\n",
"4 45 117 148 406.0\n",
"5 60 102 127 300.0\n",
"6 60 110 136 374.0\n",
"7 45 104 134 253.3\n",
"8 30 109 133 195.1\n",
"9 60 98 124 269.0\n",
"10 60 103 147 329.3\n",
"11 60 100 120 250.7\n",
"12 60 106 128 345.3\n",
"13 60 104 132 379.3\n",
"14 60 98 123 275.0\n",
"15 60 98 120 215.2\n",
"16 60 100 120 300.0\n",
"17 45 90 112 NaN\n",
"18 60 103 123 323.0\n",
"19 45 97 125 243.0\n",
"20 60 108 131 364.2\n",
"21 45 100 119 282.0\n",
"22 60 130 101 300.0\n",
"23 45 105 132 246.0\n",
"24 60 102 126 334.5\n",
"25 60 100 120 250.0\n",
"26 60 92 118 241.0\n",
"27 60 103 132 NaN\n",
"28 60 100 132 280.0\n",
"29 60 102 129 380.3\n",
"30 60 92 115 243.0\n",
"31 45 90 112 180.1\n",
"32 60 101 124 299.0\n",
"33 60 93 113 223.0\n",
"34 60 107 136 361.0\n",
"35 60 114 140 415.0\n",
"36 60 102 127 300.0\n",
"37 60 100 120 300.0\n",
"38 60 100 120 300.0\n",
"39 45 104 129 266.0\n",
"40 45 90 112 180.1\n",
"41 60 98 126 286.0\n",
"42 60 100 122 329.4\n",
"43 60 111 138 400.0\n",
"44 60 111 131 397.0\n",
"45 60 99 119 273.0\n",
"46 60 109 153 387.6\n",
"47 45 111 136 300.0\n",
"48 45 108 129 298.0\n",
"49 60 111 139 397.6\n",
"50 60 107 136 380.2\n",
"51 80 123 146 643.1\n",
"52 60 106 130 263.0\n",
"53 60 118 151 486.0\n",
"54 30 136 175 238.0\n",
"55 60 121 146 450.7\n",
"56 60 118 121 413.0\n",
"57 45 115 144 305.0\n",
"58 20 153 172 226.4\n",
"59 45 123 152 321.0\n",
"60 210 108 160 1376.0\n",
"61 160 110 137 1034.4\n",
"62 160 109 135 853.0\n",
"63 45 118 141 341.0\n",
"64 20 110 130 131.4\n",
"65 180 90 130 800.4\n",
"66 150 105 135 873.4\n",
"67 150 107 130 816.0\n",
"68 20 106 136 110.4\n",
"69 300 108 143 1500.2\n",
"70 150 97 129 1115.0\n",
"71 60 109 153 387.6\n",
"72 90 100 127 700.0\n",
"73 150 97 127 953.2\n",
"74 45 114 146 304.0\n",
"75 90 98 125 563.2\n",
"76 45 105 134 251.0\n",
"77 45 110 141 300.0\n",
"78 120 100 130 500.4\n",
"79 270 100 131 1729.0\n",
"80 30 159 182 319.2\n",
"81 45 149 169 344.0\n",
"82 30 103 139 151.1\n",
"83 120 100 130 500.0\n",
"84 45 100 120 225.3\n",
"85 30 151 170 300.0\n",
"86 45 102 136 234.0\n",
"87 120 100 157 1000.1\n",
"88 45 129 103 242.0\n",
"89 20 83 107 50.3\n",
"90 180 101 127 600.1\n",
"91 45 107 137 NaN\n",
"92 30 90 107 105.3\n",
"93 15 80 100 50.5\n",
"94 20 150 171 127.4\n",
"95 20 151 168 229.4\n",
"96 30 95 128 128.2\n",
"97 25 152 168 244.2\n",
"98 30 109 131 188.2\n",
"99 90 93 124 604.1\n",
"100 20 95 112 77.7\n",
"101 90 90 110 500.0\n",
"102 90 90 100 500.0\n",
"103 90 90 100 500.4\n",
"104 30 92 108 92.7\n",
"105 30 93 128 124.0\n",
"106 180 90 120 800.3\n",
"107 30 90 120 86.2\n",
"108 90 90 120 500.3\n",
"109 210 137 184 1860.4\n",
"110 60 102 124 325.2\n",
"111 45 107 124 275.0\n",
"112 15 124 139 124.2\n",
"113 45 100 120 225.3\n",
"114 60 108 131 367.6\n",
"115 60 108 151 351.7\n",
"116 60 116 141 443.0\n",
"117 60 97 122 277.4\n",
"118 60 105 125 NaN\n",
"119 60 103 124 332.7\n",
"120 30 112 137 193.9\n",
"121 45 100 120 100.7\n",
"122 60 119 169 336.7\n",
"123 60 107 127 344.9\n",
"124 60 111 151 368.5\n",
"125 60 98 122 271.0\n",
"126 60 97 124 275.3\n",
"127 60 109 127 382.0\n",
"128 90 99 125 466.4\n",
"129 60 114 151 384.0\n",
"130 60 104 134 342.5\n",
"131 60 107 138 357.5\n",
"132 60 103 133 335.0\n",
"133 60 106 132 327.5\n",
"134 60 103 136 339.0\n",
"135 20 136 156 189.0\n",
"136 45 117 143 317.7\n",
"137 45 115 137 318.0\n",
"138 45 113 138 308.0\n",
"139 20 141 162 222.4\n",
"140 60 108 135 390.0\n",
"141 60 97 127 NaN\n",
"142 45 100 120 250.4\n",
"143 45 122 149 335.4\n",
"144 60 136 170 470.2\n",
"145 45 106 126 270.8\n",
"146 60 107 136 400.0\n",
"147 60 112 146 361.9\n",
"148 30 103 127 185.0\n",
"149 60 110 150 409.4\n",
"150 60 106 134 343.0\n",
"151 60 109 129 353.2\n",
"152 60 109 138 374.0\n",
"153 30 150 167 275.8\n",
"154 60 105 128 328.0\n",
"155 60 111 151 368.5\n",
"156 60 97 131 270.4\n",
"157 60 100 120 270.4\n",
"158 60 114 150 382.8\n",
"159 30 80 120 240.9\n",
"160 30 85 120 250.4\n",
"161 45 90 130 260.4\n",
"162 45 95 130 270.0\n",
"163 45 100 140 280.9\n",
"164 60 105 140 290.8\n",
"165 60 110 145 300.0\n",
"166 60 115 145 310.2\n",
"167 75 120 150 320.4\n",
"168 75 125 150 330.4\n"
]
}
],
"source": [
"#https://www.w3schools.com/python/pandas/data.csv\n",
"df = pd.read_csv('https://www.w3schools.com/python/pandas/data.csv')\n",
"\n",
"print(df.to_string()) "
]
},
{
"cell_type": "markdown",
"id": "1dba1828-de88-4a61-868b-5a8b638254f0",
"metadata": {},
"source": [
"#### Analyze Data"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "e89c6f79-b957-4c6b-85bc-c76b362d7a2a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(169, 4)\n"
]
}
],
"source": [
"print(df.shape)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "460724ee-1ba6-461c-801e-46632a68c837",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 169 entries, 0 to 168\n",
"Data columns (total 4 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Duration 169 non-null int64 \n",
" 1 Pulse 169 non-null int64 \n",
" 2 Maxpulse 169 non-null int64 \n",
" 3 Calories 164 non-null float64\n",
"dtypes: float64(1), int64(3)\n",
"memory usage: 5.4 KB\n",
"None\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Duration</th>\n",
" <th>Pulse</th>\n",
" <th>Maxpulse</th>\n",
" <th>Calories</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>169.000000</td>\n",
" <td>169.000000</td>\n",
" <td>169.000000</td>\n",
" <td>164.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>63.846154</td>\n",
" <td>107.461538</td>\n",
" <td>134.047337</td>\n",
" <td>375.790244</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>42.299949</td>\n",
" <td>14.510259</td>\n",
" <td>16.450434</td>\n",
" <td>266.379919</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>15.000000</td>\n",
" <td>80.000000</td>\n",
" <td>100.000000</td>\n",
" <td>50.300000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>45.000000</td>\n",
" <td>100.000000</td>\n",
" <td>124.000000</td>\n",
" <td>250.925000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>60.000000</td>\n",
" <td>105.000000</td>\n",
" <td>131.000000</td>\n",
" <td>318.600000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>60.000000</td>\n",
" <td>111.000000</td>\n",
" <td>141.000000</td>\n",
" <td>387.600000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>300.000000</td>\n",
" <td>159.000000</td>\n",
" <td>184.000000</td>\n",
" <td>1860.400000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Duration Pulse Maxpulse Calories\n",
"count 169.000000 169.000000 169.000000 164.000000\n",
"mean 63.846154 107.461538 134.047337 375.790244\n",
"std 42.299949 14.510259 16.450434 266.379919\n",
"min 15.000000 80.000000 100.000000 50.300000\n",
"25% 45.000000 100.000000 124.000000 250.925000\n",
"50% 60.000000 105.000000 131.000000 318.600000\n",
"75% 60.000000 111.000000 141.000000 387.600000\n",
"max 300.000000 159.000000 184.000000 1860.400000"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print(df.info()) \n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9c3075b2-8160-4987-a014-050da0c374bc",
"metadata": {},
"outputs": [],
"source": [
"df.describe()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "418bbc3e-53ff-4ea2-9728-1b8aa94a140d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Duration Pulse Maxpulse Calories\n",
"0 60 110 130 409.1\n",
"1 60 117 145 479.0\n",
"2 60 103 135 340.0\n",
"3 45 109 175 282.4\n",
"4 45 117 148 406.0\n",
"5 60 102 127 300.0\n",
"6 60 110 136 374.0\n",
"7 45 104 134 253.3\n",
"8 30 109 133 195.1\n",
"9 60 98 124 269.0\n"
]
}
],
"source": [
"print(df.head(10))"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "b7424301-9bb4-4200-b2d5-bcc8a22c6427",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Duration Pulse Maxpulse Calories\n",
"0 60 110 130 409.1\n",
"1 60 117 145 479.0\n",
"2 60 103 135 340.0\n",
"3 45 109 175 282.4\n",
"4 45 117 148 406.0\n"
]
}
],
"source": [
"print(df.head())"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "a7649f71-6c14-4158-8040-08019577b1ad",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Duration Pulse Maxpulse Calories\n",
"164 60 105 140 290.8\n",
"165 60 110 145 300.0\n",
"166 60 115 145 310.2\n",
"167 75 120 150 320.4\n",
"168 75 125 150 330.4\n"
]
}
],
"source": [
"print(df.tail())"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "8da3696c-6a3c-4727-8cea-6442d7cedf28",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"count 169.000000\n",
"mean 107.461538\n",
"std 14.510259\n",
"min 80.000000\n",
"25% 100.000000\n",
"50% 105.000000\n",
"75% 111.000000\n",
"max 159.000000\n",
"Name: Pulse, dtype: float64\n"
]
}
],
"source": [
"print(df['Pulse'].describe())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8689d23c-5d68-4fdb-bcef-9ae080442a27",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}