Technologische_Grundlagen/course/pandas/02_cleaning.ipynb

1260 lines
45 KiB
Plaintext
Raw Permalink Normal View History

2024-09-27 07:00:19 +00:00
{
"cells": [
{
"cell_type": "markdown",
"id": "b0c0ae08-2fb5-47f5-a5ce-1a66e35791a4",
"metadata": {},
"source": [
"### Cleaning Data"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "f9998a78-ae01-4531-b325-637b6d5ee86d",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "9516a86a-ed6a-4f79-b631-3195daec258c",
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv('https://gist.githubusercontent.com/maltegrosse/bdfd2c6a5e3bff315d92cd27c2461a48/raw/49d5672953360934601b3d252c9b78121eed10db/data.csv')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "ea25a32c-70d3-479d-8d11-7e487f13f50c",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Duration</th>\n",
" <th>Date</th>\n",
" <th>Pulse</th>\n",
" <th>Maxpulse</th>\n",
" <th>Calories</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>60</td>\n",
" <td>'2020/12/01'</td>\n",
" <td>110</td>\n",
" <td>130</td>\n",
" <td>409.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>60</td>\n",
" <td>'2020/12/02'</td>\n",
" <td>117</td>\n",
" <td>145</td>\n",
" <td>479.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>60</td>\n",
" <td>'2020/12/03'</td>\n",
" <td>103</td>\n",
" <td>135</td>\n",
" <td>340.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>45</td>\n",
" <td>'2020/12/04'</td>\n",
" <td>109</td>\n",
" <td>175</td>\n",
" <td>282.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>45</td>\n",
" <td>'2020/12/05'</td>\n",
" <td>117</td>\n",
" <td>148</td>\n",
" <td>406.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>60</td>\n",
" <td>'2020/12/06'</td>\n",
" <td>102</td>\n",
" <td>127</td>\n",
" <td>300.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>60</td>\n",
" <td>'2020/12/07'</td>\n",
" <td>110</td>\n",
" <td>136</td>\n",
" <td>374.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>450</td>\n",
" <td>'2020/12/08'</td>\n",
" <td>104</td>\n",
" <td>134</td>\n",
" <td>253.3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>30</td>\n",
" <td>'2020/12/09'</td>\n",
" <td>109</td>\n",
" <td>133</td>\n",
" <td>195.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>60</td>\n",
" <td>'2020/12/10'</td>\n",
" <td>98</td>\n",
" <td>124</td>\n",
" <td>269.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>60</td>\n",
" <td>'2020/12/11'</td>\n",
" <td>103</td>\n",
" <td>147</td>\n",
" <td>329.3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>60</td>\n",
" <td>'2020/12/12'</td>\n",
" <td>100</td>\n",
" <td>120</td>\n",
" <td>250.7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>60</td>\n",
" <td>'2020/12/12'</td>\n",
" <td>100</td>\n",
" <td>120</td>\n",
" <td>250.7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>60</td>\n",
" <td>'2020/12/13'</td>\n",
" <td>106</td>\n",
" <td>128</td>\n",
" <td>345.3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>60</td>\n",
" <td>'2020/12/14'</td>\n",
" <td>104</td>\n",
" <td>132</td>\n",
" <td>379.3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>60</td>\n",
" <td>'2020/12/15'</td>\n",
" <td>98</td>\n",
" <td>123</td>\n",
" <td>275.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>60</td>\n",
" <td>'2020/12/16'</td>\n",
" <td>98</td>\n",
" <td>120</td>\n",
" <td>215.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>60</td>\n",
" <td>'2020/12/17'</td>\n",
" <td>100</td>\n",
" <td>120</td>\n",
" <td>300.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>45</td>\n",
" <td>'2020/12/18'</td>\n",
" <td>90</td>\n",
" <td>112</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>60</td>\n",
" <td>'2020/12/19'</td>\n",
" <td>103</td>\n",
" <td>123</td>\n",
" <td>323.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>45</td>\n",
" <td>'2020/12/20'</td>\n",
" <td>97</td>\n",
" <td>125</td>\n",
" <td>243.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>60</td>\n",
" <td>'2020/12/21'</td>\n",
" <td>108</td>\n",
" <td>131</td>\n",
" <td>364.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>45</td>\n",
" <td>NaN</td>\n",
" <td>100</td>\n",
" <td>119</td>\n",
" <td>282.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>60</td>\n",
" <td>'2020/12/23'</td>\n",
" <td>130</td>\n",
" <td>101</td>\n",
" <td>300.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>45</td>\n",
" <td>'2020/12/24'</td>\n",
" <td>105</td>\n",
" <td>132</td>\n",
" <td>246.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>60</td>\n",
" <td>'2020/12/25'</td>\n",
" <td>102</td>\n",
" <td>126</td>\n",
" <td>334.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>60</td>\n",
" <td>20201226</td>\n",
" <td>100</td>\n",
" <td>120</td>\n",
" <td>250.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>60</td>\n",
" <td>'2020/12/27'</td>\n",
" <td>92</td>\n",
" <td>118</td>\n",
" <td>241.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>60</td>\n",
" <td>'2020/12/28'</td>\n",
" <td>103</td>\n",
" <td>132</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>60</td>\n",
" <td>'2020/12/29'</td>\n",
" <td>100</td>\n",
" <td>132</td>\n",
" <td>280.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>30</th>\n",
" <td>60</td>\n",
" <td>'2020/12/30'</td>\n",
" <td>102</td>\n",
" <td>129</td>\n",
" <td>380.3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>31</th>\n",
" <td>60</td>\n",
" <td>'2020/12/31'</td>\n",
" <td>92</td>\n",
" <td>115</td>\n",
" <td>243.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Duration Date Pulse Maxpulse Calories\n",
"0 60 '2020/12/01' 110 130 409.1\n",
"1 60 '2020/12/02' 117 145 479.0\n",
"2 60 '2020/12/03' 103 135 340.0\n",
"3 45 '2020/12/04' 109 175 282.4\n",
"4 45 '2020/12/05' 117 148 406.0\n",
"5 60 '2020/12/06' 102 127 300.0\n",
"6 60 '2020/12/07' 110 136 374.0\n",
"7 450 '2020/12/08' 104 134 253.3\n",
"8 30 '2020/12/09' 109 133 195.1\n",
"9 60 '2020/12/10' 98 124 269.0\n",
"10 60 '2020/12/11' 103 147 329.3\n",
"11 60 '2020/12/12' 100 120 250.7\n",
"12 60 '2020/12/12' 100 120 250.7\n",
"13 60 '2020/12/13' 106 128 345.3\n",
"14 60 '2020/12/14' 104 132 379.3\n",
"15 60 '2020/12/15' 98 123 275.0\n",
"16 60 '2020/12/16' 98 120 215.2\n",
"17 60 '2020/12/17' 100 120 300.0\n",
"18 45 '2020/12/18' 90 112 NaN\n",
"19 60 '2020/12/19' 103 123 323.0\n",
"20 45 '2020/12/20' 97 125 243.0\n",
"21 60 '2020/12/21' 108 131 364.2\n",
"22 45 NaN 100 119 282.0\n",
"23 60 '2020/12/23' 130 101 300.0\n",
"24 45 '2020/12/24' 105 132 246.0\n",
"25 60 '2020/12/25' 102 126 334.5\n",
"26 60 20201226 100 120 250.0\n",
"27 60 '2020/12/27' 92 118 241.0\n",
"28 60 '2020/12/28' 103 132 NaN\n",
"29 60 '2020/12/29' 100 132 280.0\n",
"30 60 '2020/12/30' 102 129 380.3\n",
"31 60 '2020/12/31' 92 115 243.0"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "2baf29d8-cd8f-4dfd-931a-c413a995320e",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Duration</th>\n",
" <th>Date</th>\n",
" <th>Pulse</th>\n",
" <th>Maxpulse</th>\n",
" <th>Calories</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>60</td>\n",
" <td>'2020/12/01'</td>\n",
" <td>110</td>\n",
" <td>130</td>\n",
" <td>409.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>60</td>\n",
" <td>'2020/12/02'</td>\n",
" <td>117</td>\n",
" <td>145</td>\n",
" <td>479.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>60</td>\n",
" <td>'2020/12/03'</td>\n",
" <td>103</td>\n",
" <td>135</td>\n",
" <td>340.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>45</td>\n",
" <td>'2020/12/04'</td>\n",
" <td>109</td>\n",
" <td>175</td>\n",
" <td>282.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>45</td>\n",
" <td>'2020/12/05'</td>\n",
" <td>117</td>\n",
" <td>148</td>\n",
" <td>406.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>60</td>\n",
" <td>'2020/12/06'</td>\n",
" <td>102</td>\n",
" <td>127</td>\n",
" <td>300.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>60</td>\n",
" <td>'2020/12/07'</td>\n",
" <td>110</td>\n",
" <td>136</td>\n",
" <td>374.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>450</td>\n",
" <td>'2020/12/08'</td>\n",
" <td>104</td>\n",
" <td>134</td>\n",
" <td>253.3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>30</td>\n",
" <td>'2020/12/09'</td>\n",
" <td>109</td>\n",
" <td>133</td>\n",
" <td>195.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>60</td>\n",
" <td>'2020/12/10'</td>\n",
" <td>98</td>\n",
" <td>124</td>\n",
" <td>269.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>60</td>\n",
" <td>'2020/12/11'</td>\n",
" <td>103</td>\n",
" <td>147</td>\n",
" <td>329.3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>60</td>\n",
" <td>'2020/12/12'</td>\n",
" <td>100</td>\n",
" <td>120</td>\n",
" <td>250.7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>60</td>\n",
" <td>'2020/12/12'</td>\n",
" <td>100</td>\n",
" <td>120</td>\n",
" <td>250.7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>60</td>\n",
" <td>'2020/12/13'</td>\n",
" <td>106</td>\n",
" <td>128</td>\n",
" <td>345.3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>60</td>\n",
" <td>'2020/12/14'</td>\n",
" <td>104</td>\n",
" <td>132</td>\n",
" <td>379.3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>60</td>\n",
" <td>'2020/12/15'</td>\n",
" <td>98</td>\n",
" <td>123</td>\n",
" <td>275.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>60</td>\n",
" <td>'2020/12/16'</td>\n",
" <td>98</td>\n",
" <td>120</td>\n",
" <td>215.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>60</td>\n",
" <td>'2020/12/17'</td>\n",
" <td>100</td>\n",
" <td>120</td>\n",
" <td>300.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>60</td>\n",
" <td>'2020/12/19'</td>\n",
" <td>103</td>\n",
" <td>123</td>\n",
" <td>323.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>45</td>\n",
" <td>'2020/12/20'</td>\n",
" <td>97</td>\n",
" <td>125</td>\n",
" <td>243.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>60</td>\n",
" <td>'2020/12/21'</td>\n",
" <td>108</td>\n",
" <td>131</td>\n",
" <td>364.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>60</td>\n",
" <td>'2020/12/23'</td>\n",
" <td>130</td>\n",
" <td>101</td>\n",
" <td>300.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>45</td>\n",
" <td>'2020/12/24'</td>\n",
" <td>105</td>\n",
" <td>132</td>\n",
" <td>246.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>60</td>\n",
" <td>'2020/12/25'</td>\n",
" <td>102</td>\n",
" <td>126</td>\n",
" <td>334.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>60</td>\n",
" <td>20201226</td>\n",
" <td>100</td>\n",
" <td>120</td>\n",
" <td>250.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>60</td>\n",
" <td>'2020/12/27'</td>\n",
" <td>92</td>\n",
" <td>118</td>\n",
" <td>241.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>60</td>\n",
" <td>'2020/12/29'</td>\n",
" <td>100</td>\n",
" <td>132</td>\n",
" <td>280.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>30</th>\n",
" <td>60</td>\n",
" <td>'2020/12/30'</td>\n",
" <td>102</td>\n",
" <td>129</td>\n",
" <td>380.3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>31</th>\n",
" <td>60</td>\n",
" <td>'2020/12/31'</td>\n",
" <td>92</td>\n",
" <td>115</td>\n",
" <td>243.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Duration Date Pulse Maxpulse Calories\n",
"0 60 '2020/12/01' 110 130 409.1\n",
"1 60 '2020/12/02' 117 145 479.0\n",
"2 60 '2020/12/03' 103 135 340.0\n",
"3 45 '2020/12/04' 109 175 282.4\n",
"4 45 '2020/12/05' 117 148 406.0\n",
"5 60 '2020/12/06' 102 127 300.0\n",
"6 60 '2020/12/07' 110 136 374.0\n",
"7 450 '2020/12/08' 104 134 253.3\n",
"8 30 '2020/12/09' 109 133 195.1\n",
"9 60 '2020/12/10' 98 124 269.0\n",
"10 60 '2020/12/11' 103 147 329.3\n",
"11 60 '2020/12/12' 100 120 250.7\n",
"12 60 '2020/12/12' 100 120 250.7\n",
"13 60 '2020/12/13' 106 128 345.3\n",
"14 60 '2020/12/14' 104 132 379.3\n",
"15 60 '2020/12/15' 98 123 275.0\n",
"16 60 '2020/12/16' 98 120 215.2\n",
"17 60 '2020/12/17' 100 120 300.0\n",
"19 60 '2020/12/19' 103 123 323.0\n",
"20 45 '2020/12/20' 97 125 243.0\n",
"21 60 '2020/12/21' 108 131 364.2\n",
"23 60 '2020/12/23' 130 101 300.0\n",
"24 45 '2020/12/24' 105 132 246.0\n",
"25 60 '2020/12/25' 102 126 334.5\n",
"26 60 20201226 100 120 250.0\n",
"27 60 '2020/12/27' 92 118 241.0\n",
"29 60 '2020/12/29' 100 132 280.0\n",
"30 60 '2020/12/30' 102 129 380.3\n",
"31 60 '2020/12/31' 92 115 243.0"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# drop null/NaN\n",
"new_df = df.dropna()\n",
"new_df"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "86df1d5f-639e-4f0b-8576-eb4a9dbee188",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 29 entries, 0 to 31\n",
"Data columns (total 5 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Duration 29 non-null int64 \n",
" 1 Date 29 non-null object \n",
" 2 Pulse 29 non-null int64 \n",
" 3 Maxpulse 29 non-null int64 \n",
" 4 Calories 29 non-null float64\n",
"dtypes: float64(1), int64(3), object(1)\n",
"memory usage: 1.4+ KB\n"
]
}
],
"source": [
"df.locnew_df.info()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "37533007-2851-49da-8fca-2e9d3b74c406",
"metadata": {},
"outputs": [],
"source": [
"# hint df.dropna(inplace = True) <- manipulates orginal df"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "e94f0608-1928-4dec-b28c-3f56d72b1867",
"metadata": {},
"outputs": [],
"source": [
"# fill missing values\n",
"# df.fillna(130, inplace = True)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "025cec14-2687-4ec5-9fa9-f10f1da927ea",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_733/185300893.py:3: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
"The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
"\n",
"For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
"\n",
"\n",
" df[\"Calories\"].fillna(x, inplace=True)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Duration</th>\n",
" <th>Date</th>\n",
" <th>Pulse</th>\n",
" <th>Maxpulse</th>\n",
" <th>Calories</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>60</td>\n",
" <td>'2020/12/01'</td>\n",
" <td>110</td>\n",
" <td>130</td>\n",
" <td>409.10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>60</td>\n",
" <td>'2020/12/02'</td>\n",
" <td>117</td>\n",
" <td>145</td>\n",
" <td>479.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>60</td>\n",
" <td>'2020/12/03'</td>\n",
" <td>103</td>\n",
" <td>135</td>\n",
" <td>340.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>45</td>\n",
" <td>'2020/12/04'</td>\n",
" <td>109</td>\n",
" <td>175</td>\n",
" <td>282.40</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>45</td>\n",
" <td>'2020/12/05'</td>\n",
" <td>117</td>\n",
" <td>148</td>\n",
" <td>406.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>60</td>\n",
" <td>'2020/12/06'</td>\n",
" <td>102</td>\n",
" <td>127</td>\n",
" <td>300.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>60</td>\n",
" <td>'2020/12/07'</td>\n",
" <td>110</td>\n",
" <td>136</td>\n",
" <td>374.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>450</td>\n",
" <td>'2020/12/08'</td>\n",
" <td>104</td>\n",
" <td>134</td>\n",
" <td>253.30</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>30</td>\n",
" <td>'2020/12/09'</td>\n",
" <td>109</td>\n",
" <td>133</td>\n",
" <td>195.10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>60</td>\n",
" <td>'2020/12/10'</td>\n",
" <td>98</td>\n",
" <td>124</td>\n",
" <td>269.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>60</td>\n",
" <td>'2020/12/11'</td>\n",
" <td>103</td>\n",
" <td>147</td>\n",
" <td>329.30</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>60</td>\n",
" <td>'2020/12/12'</td>\n",
" <td>100</td>\n",
" <td>120</td>\n",
" <td>250.70</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>60</td>\n",
" <td>'2020/12/12'</td>\n",
" <td>100</td>\n",
" <td>120</td>\n",
" <td>250.70</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>60</td>\n",
" <td>'2020/12/13'</td>\n",
" <td>106</td>\n",
" <td>128</td>\n",
" <td>345.30</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>60</td>\n",
" <td>'2020/12/14'</td>\n",
" <td>104</td>\n",
" <td>132</td>\n",
" <td>379.30</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>60</td>\n",
" <td>'2020/12/15'</td>\n",
" <td>98</td>\n",
" <td>123</td>\n",
" <td>275.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>60</td>\n",
" <td>'2020/12/16'</td>\n",
" <td>98</td>\n",
" <td>120</td>\n",
" <td>215.20</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>60</td>\n",
" <td>'2020/12/17'</td>\n",
" <td>100</td>\n",
" <td>120</td>\n",
" <td>300.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>45</td>\n",
" <td>'2020/12/18'</td>\n",
" <td>90</td>\n",
" <td>112</td>\n",
" <td>304.68</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>60</td>\n",
" <td>'2020/12/19'</td>\n",
" <td>103</td>\n",
" <td>123</td>\n",
" <td>323.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>45</td>\n",
" <td>'2020/12/20'</td>\n",
" <td>97</td>\n",
" <td>125</td>\n",
" <td>243.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>60</td>\n",
" <td>'2020/12/21'</td>\n",
" <td>108</td>\n",
" <td>131</td>\n",
" <td>364.20</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>45</td>\n",
" <td>NaN</td>\n",
" <td>100</td>\n",
" <td>119</td>\n",
" <td>282.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>60</td>\n",
" <td>'2020/12/23'</td>\n",
" <td>130</td>\n",
" <td>101</td>\n",
" <td>300.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>45</td>\n",
" <td>'2020/12/24'</td>\n",
" <td>105</td>\n",
" <td>132</td>\n",
" <td>246.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>60</td>\n",
" <td>'2020/12/25'</td>\n",
" <td>102</td>\n",
" <td>126</td>\n",
" <td>334.50</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>60</td>\n",
" <td>20201226</td>\n",
" <td>100</td>\n",
" <td>120</td>\n",
" <td>250.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>60</td>\n",
" <td>'2020/12/27'</td>\n",
" <td>92</td>\n",
" <td>118</td>\n",
" <td>241.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>60</td>\n",
" <td>'2020/12/28'</td>\n",
" <td>103</td>\n",
" <td>132</td>\n",
" <td>304.68</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>60</td>\n",
" <td>'2020/12/29'</td>\n",
" <td>100</td>\n",
" <td>132</td>\n",
" <td>280.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>30</th>\n",
" <td>60</td>\n",
" <td>'2020/12/30'</td>\n",
" <td>102</td>\n",
" <td>129</td>\n",
" <td>380.30</td>\n",
" </tr>\n",
" <tr>\n",
" <th>31</th>\n",
" <td>60</td>\n",
" <td>'2020/12/31'</td>\n",
" <td>92</td>\n",
" <td>115</td>\n",
" <td>243.00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Duration Date Pulse Maxpulse Calories\n",
"0 60 '2020/12/01' 110 130 409.10\n",
"1 60 '2020/12/02' 117 145 479.00\n",
"2 60 '2020/12/03' 103 135 340.00\n",
"3 45 '2020/12/04' 109 175 282.40\n",
"4 45 '2020/12/05' 117 148 406.00\n",
"5 60 '2020/12/06' 102 127 300.00\n",
"6 60 '2020/12/07' 110 136 374.00\n",
"7 450 '2020/12/08' 104 134 253.30\n",
"8 30 '2020/12/09' 109 133 195.10\n",
"9 60 '2020/12/10' 98 124 269.00\n",
"10 60 '2020/12/11' 103 147 329.30\n",
"11 60 '2020/12/12' 100 120 250.70\n",
"12 60 '2020/12/12' 100 120 250.70\n",
"13 60 '2020/12/13' 106 128 345.30\n",
"14 60 '2020/12/14' 104 132 379.30\n",
"15 60 '2020/12/15' 98 123 275.00\n",
"16 60 '2020/12/16' 98 120 215.20\n",
"17 60 '2020/12/17' 100 120 300.00\n",
"18 45 '2020/12/18' 90 112 304.68\n",
"19 60 '2020/12/19' 103 123 323.00\n",
"20 45 '2020/12/20' 97 125 243.00\n",
"21 60 '2020/12/21' 108 131 364.20\n",
"22 45 NaN 100 119 282.00\n",
"23 60 '2020/12/23' 130 101 300.00\n",
"24 45 '2020/12/24' 105 132 246.00\n",
"25 60 '2020/12/25' 102 126 334.50\n",
"26 60 20201226 100 120 250.00\n",
"27 60 '2020/12/27' 92 118 241.00\n",
"28 60 '2020/12/28' 103 132 304.68\n",
"29 60 '2020/12/29' 100 132 280.00\n",
"30 60 '2020/12/30' 102 129 380.30\n",
"31 60 '2020/12/31' 92 115 243.00"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x = df[\"Calories\"].mean()\n",
"\n",
"df[\"Calories\"].fillna(x, inplace=True)\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "d2e87f3b-ef58-4128-b52f-799056e56de8",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_733/2663698494.py:3: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
"The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
"\n",
"For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
"\n",
"\n",
" df[\"Calories\"].fillna(x, inplace = True)\n"
]
}
],
"source": [
"x = df[\"Calories\"].median()\n",
"\n",
"df[\"Calories\"].fillna(x, inplace = True)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "c42df786-aa1b-4174-b436-566421f1683b",
"metadata": {},
"outputs": [
{
"ename": "ValueError",
"evalue": "time data \"20201226\" doesn't match format \"'%Y/%m/%d'\", at position 26. You might want to try:\n - passing `format` if your strings have a consistent format;\n - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;\n - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[9], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# convert into proper data type\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mDate\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mto_datetime\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdf\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mDate\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3\u001b[0m df\n",
"File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/pandas/core/tools/datetimes.py:1067\u001b[0m, in \u001b[0;36mto_datetime\u001b[0;34m(arg, errors, dayfirst, yearfirst, utc, format, exact, unit, infer_datetime_format, origin, cache)\u001b[0m\n\u001b[1;32m 1065\u001b[0m result \u001b[38;5;241m=\u001b[39m arg\u001b[38;5;241m.\u001b[39mmap(cache_array)\n\u001b[1;32m 1066\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1067\u001b[0m values \u001b[38;5;241m=\u001b[39m \u001b[43mconvert_listlike\u001b[49m\u001b[43m(\u001b[49m\u001b[43marg\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_values\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mformat\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1068\u001b[0m result \u001b[38;5;241m=\u001b[39m arg\u001b[38;5;241m.\u001b[39m_constructor(values, index\u001b[38;5;241m=\u001b[39marg\u001b[38;5;241m.\u001b[39mindex, name\u001b[38;5;241m=\u001b[39marg\u001b[38;5;241m.\u001b[39mname)\n\u001b[1;32m 1069\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(arg, (ABCDataFrame, abc\u001b[38;5;241m.\u001b[39mMutableMapping)):\n",
"File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/pandas/core/tools/datetimes.py:433\u001b[0m, in \u001b[0;36m_convert_listlike_datetimes\u001b[0;34m(arg, format, name, utc, unit, errors, dayfirst, yearfirst, exact)\u001b[0m\n\u001b[1;32m 431\u001b[0m \u001b[38;5;66;03m# `format` could be inferred, or user didn't ask for mixed-format parsing.\u001b[39;00m\n\u001b[1;32m 432\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mformat\u001b[39m \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mformat\u001b[39m \u001b[38;5;241m!=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmixed\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[0;32m--> 433\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_array_strptime_with_fallback\u001b[49m\u001b[43m(\u001b[49m\u001b[43marg\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mname\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mutc\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mformat\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexact\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 435\u001b[0m result, tz_parsed \u001b[38;5;241m=\u001b[39m objects_to_datetime64(\n\u001b[1;32m 436\u001b[0m arg,\n\u001b[1;32m 437\u001b[0m dayfirst\u001b[38;5;241m=\u001b[39mdayfirst,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 441\u001b[0m allow_object\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[1;32m 442\u001b[0m )\n\u001b[1;32m 444\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m tz_parsed \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 445\u001b[0m \u001b[38;5;66;03m# We can take a shortcut since the datetime64 numpy array\u001b[39;00m\n\u001b[1;32m 446\u001b[0m \u001b[38;5;66;03m# is in UTC\u001b[39;00m\n",
"File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/pandas/core/tools/datetimes.py:467\u001b[0m, in \u001b[0;36m_array_strptime_with_fallback\u001b[0;34m(arg, name, utc, fmt, exact, errors)\u001b[0m\n\u001b[1;32m 456\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_array_strptime_with_fallback\u001b[39m(\n\u001b[1;32m 457\u001b[0m arg,\n\u001b[1;32m 458\u001b[0m name,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 462\u001b[0m errors: \u001b[38;5;28mstr\u001b[39m,\n\u001b[1;32m 463\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Index:\n\u001b[1;32m 464\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 465\u001b[0m \u001b[38;5;124;03m Call array_strptime, with fallback behavior depending on 'errors'.\u001b[39;00m\n\u001b[1;32m 466\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 467\u001b[0m result, tz_out \u001b[38;5;241m=\u001b[39m \u001b[43marray_strptime\u001b[49m\u001b[43m(\u001b[49m\u001b[43marg\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfmt\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexact\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mexact\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mutc\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mutc\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 468\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m tz_out \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 469\u001b[0m unit \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mdatetime_data(result\u001b[38;5;241m.\u001b[39mdtype)[\u001b[38;5;241m0\u001b[39m]\n",
"File \u001b[0;32mstrptime.pyx:501\u001b[0m, in \u001b[0;36mpandas._libs.tslibs.strptime.array_strptime\u001b[0;34m()\u001b[0m\n",
"File \u001b[0;32mstrptime.pyx:451\u001b[0m, in \u001b[0;36mpandas._libs.tslibs.strptime.array_strptime\u001b[0;34m()\u001b[0m\n",
"File \u001b[0;32mstrptime.pyx:583\u001b[0m, in \u001b[0;36mpandas._libs.tslibs.strptime._parse_with_format\u001b[0;34m()\u001b[0m\n",
"\u001b[0;31mValueError\u001b[0m: time data \"20201226\" doesn't match format \"'%Y/%m/%d'\", at position 26. You might want to try:\n - passing `format` if your strings have a consistent format;\n - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;\n - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this."
]
}
],
"source": [
"# convert into proper data type\n",
"df['Date'] = pd.to_datetime(df['Date'])\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6508edc2-f7f1-469b-a094-1b6c98a155e3",
"metadata": {},
"outputs": [],
"source": [
"# remove missing value according to a column\n",
"# df.dropna(subset=['Date'], inplace = True)"
]
},
{
"cell_type": "markdown",
"id": "725032e8-c03e-428e-a928-f5c2533a3446",
"metadata": {},
"source": [
"#### Fixing Wrong Data"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3367d5c9-90f8-4fb1-9c2b-bae2bdaeb7bf",
"metadata": {},
"outputs": [],
"source": [
"# row 7: 450 duration!\n",
"df.loc[7, 'Duration'] = 45"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1a9ce891-9275-4539-a23c-4826fb258c1d",
"metadata": {},
"outputs": [],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7888f644-60a5-41e2-bd9f-acf1f5e08f5d",
"metadata": {},
"outputs": [],
"source": [
"# remove duplicates row 11 & 12\n",
"print(df.duplicated())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ff4ee9a2-dabb-4015-8b0c-5527f688bb21",
"metadata": {},
"outputs": [],
"source": [
"df.drop_duplicates(inplace = True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "44165eb4-ab0c-4be0-92d6-4c8ccf2ff389",
"metadata": {},
"outputs": [],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3033c2a4-18f1-4fcd-be75-f71f95c9097f",
"metadata": {},
"outputs": [],
"source": [
"df.to_csv('cleaned.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "549ea6b3-3903-4b74-88ad-74c60e7d862e",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}