Technologische_Grundlagen/course/pandas/.ipynb_checkpoints/02_cleaning-checkpoint.ipynb

2207 lines
67 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"id": "b0c0ae08-2fb5-47f5-a5ce-1a66e35791a4",
"metadata": {},
"source": [
"### Cleaning Data"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "f9998a78-ae01-4531-b325-637b6d5ee86d",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "9516a86a-ed6a-4f79-b631-3195daec258c",
"metadata": {},
"outputs": [],
"source": [
"# Load the sample CSV from the gist into the working DataFrame.\n",
"df = pd.read_csv(\n",
"    filepath_or_buffer='https://gist.githubusercontent.com/maltegrosse/bdfd2c6a5e3bff315d92cd27c2461a48/raw/49d5672953360934601b3d252c9b78121eed10db/data.csv',\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "ea25a32c-70d3-479d-8d11-7e487f13f50c",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Duration</th>\n",
" <th>Date</th>\n",
" <th>Pulse</th>\n",
" <th>Maxpulse</th>\n",
" <th>Calories</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>60</td>\n",
" <td>'2020/12/01'</td>\n",
" <td>110</td>\n",
" <td>130</td>\n",
" <td>409.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>60</td>\n",
" <td>'2020/12/02'</td>\n",
" <td>117</td>\n",
" <td>145</td>\n",
" <td>479.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>60</td>\n",
" <td>'2020/12/03'</td>\n",
" <td>103</td>\n",
" <td>135</td>\n",
" <td>340.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>45</td>\n",
" <td>'2020/12/04'</td>\n",
" <td>109</td>\n",
" <td>175</td>\n",
" <td>282.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>45</td>\n",
" <td>'2020/12/05'</td>\n",
" <td>117</td>\n",
" <td>148</td>\n",
" <td>406.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>60</td>\n",
" <td>'2020/12/06'</td>\n",
" <td>102</td>\n",
" <td>127</td>\n",
" <td>300.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>60</td>\n",
" <td>'2020/12/07'</td>\n",
" <td>110</td>\n",
" <td>136</td>\n",
" <td>374.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>450</td>\n",
" <td>'2020/12/08'</td>\n",
" <td>104</td>\n",
" <td>134</td>\n",
" <td>253.3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>30</td>\n",
" <td>'2020/12/09'</td>\n",
" <td>109</td>\n",
" <td>133</td>\n",
" <td>195.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>60</td>\n",
" <td>'2020/12/10'</td>\n",
" <td>98</td>\n",
" <td>124</td>\n",
" <td>269.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>60</td>\n",
" <td>'2020/12/11'</td>\n",
" <td>103</td>\n",
" <td>147</td>\n",
" <td>329.3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>60</td>\n",
" <td>'2020/12/12'</td>\n",
" <td>100</td>\n",
" <td>120</td>\n",
" <td>250.7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>60</td>\n",
" <td>'2020/12/12'</td>\n",
" <td>100</td>\n",
" <td>120</td>\n",
" <td>250.7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>60</td>\n",
" <td>'2020/12/13'</td>\n",
" <td>106</td>\n",
" <td>128</td>\n",
" <td>345.3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>60</td>\n",
" <td>'2020/12/14'</td>\n",
" <td>104</td>\n",
" <td>132</td>\n",
" <td>379.3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>60</td>\n",
" <td>'2020/12/15'</td>\n",
" <td>98</td>\n",
" <td>123</td>\n",
" <td>275.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>60</td>\n",
" <td>'2020/12/16'</td>\n",
" <td>98</td>\n",
" <td>120</td>\n",
" <td>215.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>60</td>\n",
" <td>'2020/12/17'</td>\n",
" <td>100</td>\n",
" <td>120</td>\n",
" <td>300.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>45</td>\n",
" <td>'2020/12/18'</td>\n",
" <td>90</td>\n",
" <td>112</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>60</td>\n",
" <td>'2020/12/19'</td>\n",
" <td>103</td>\n",
" <td>123</td>\n",
" <td>323.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>45</td>\n",
" <td>'2020/12/20'</td>\n",
" <td>97</td>\n",
" <td>125</td>\n",
" <td>243.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>60</td>\n",
" <td>'2020/12/21'</td>\n",
" <td>108</td>\n",
" <td>131</td>\n",
" <td>364.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>45</td>\n",
" <td>NaN</td>\n",
" <td>100</td>\n",
" <td>119</td>\n",
" <td>282.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>60</td>\n",
" <td>'2020/12/23'</td>\n",
" <td>130</td>\n",
" <td>101</td>\n",
" <td>300.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>45</td>\n",
" <td>'2020/12/24'</td>\n",
" <td>105</td>\n",
" <td>132</td>\n",
" <td>246.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>60</td>\n",
" <td>'2020/12/25'</td>\n",
" <td>102</td>\n",
" <td>126</td>\n",
" <td>334.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>60</td>\n",
" <td>20201226</td>\n",
" <td>100</td>\n",
" <td>120</td>\n",
" <td>250.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>60</td>\n",
" <td>'2020/12/27'</td>\n",
" <td>92</td>\n",
" <td>118</td>\n",
" <td>241.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>60</td>\n",
" <td>'2020/12/28'</td>\n",
" <td>103</td>\n",
" <td>132</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>60</td>\n",
" <td>'2020/12/29'</td>\n",
" <td>100</td>\n",
" <td>132</td>\n",
" <td>280.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>30</th>\n",
" <td>60</td>\n",
" <td>'2020/12/30'</td>\n",
" <td>102</td>\n",
" <td>129</td>\n",
" <td>380.3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>31</th>\n",
" <td>60</td>\n",
" <td>'2020/12/31'</td>\n",
" <td>92</td>\n",
" <td>115</td>\n",
" <td>243.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Duration Date Pulse Maxpulse Calories\n",
"0 60 '2020/12/01' 110 130 409.1\n",
"1 60 '2020/12/02' 117 145 479.0\n",
"2 60 '2020/12/03' 103 135 340.0\n",
"3 45 '2020/12/04' 109 175 282.4\n",
"4 45 '2020/12/05' 117 148 406.0\n",
"5 60 '2020/12/06' 102 127 300.0\n",
"6 60 '2020/12/07' 110 136 374.0\n",
"7 450 '2020/12/08' 104 134 253.3\n",
"8 30 '2020/12/09' 109 133 195.1\n",
"9 60 '2020/12/10' 98 124 269.0\n",
"10 60 '2020/12/11' 103 147 329.3\n",
"11 60 '2020/12/12' 100 120 250.7\n",
"12 60 '2020/12/12' 100 120 250.7\n",
"13 60 '2020/12/13' 106 128 345.3\n",
"14 60 '2020/12/14' 104 132 379.3\n",
"15 60 '2020/12/15' 98 123 275.0\n",
"16 60 '2020/12/16' 98 120 215.2\n",
"17 60 '2020/12/17' 100 120 300.0\n",
"18 45 '2020/12/18' 90 112 NaN\n",
"19 60 '2020/12/19' 103 123 323.0\n",
"20 45 '2020/12/20' 97 125 243.0\n",
"21 60 '2020/12/21' 108 131 364.2\n",
"22 45 NaN 100 119 282.0\n",
"23 60 '2020/12/23' 130 101 300.0\n",
"24 45 '2020/12/24' 105 132 246.0\n",
"25 60 '2020/12/25' 102 126 334.5\n",
"26 60 20201226 100 120 250.0\n",
"27 60 '2020/12/27' 92 118 241.0\n",
"28 60 '2020/12/28' 103 132 NaN\n",
"29 60 '2020/12/29' 100 132 280.0\n",
"30 60 '2020/12/30' 102 129 380.3\n",
"31 60 '2020/12/31' 92 115 243.0"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "2baf29d8-cd8f-4dfd-931a-c413a995320e",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Duration</th>\n",
" <th>Date</th>\n",
" <th>Pulse</th>\n",
" <th>Maxpulse</th>\n",
" <th>Calories</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>60</td>\n",
" <td>'2020/12/01'</td>\n",
" <td>110</td>\n",
" <td>130</td>\n",
" <td>409.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>60</td>\n",
" <td>'2020/12/02'</td>\n",
" <td>117</td>\n",
" <td>145</td>\n",
" <td>479.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>60</td>\n",
" <td>'2020/12/03'</td>\n",
" <td>103</td>\n",
" <td>135</td>\n",
" <td>340.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>45</td>\n",
" <td>'2020/12/04'</td>\n",
" <td>109</td>\n",
" <td>175</td>\n",
" <td>282.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>45</td>\n",
" <td>'2020/12/05'</td>\n",
" <td>117</td>\n",
" <td>148</td>\n",
" <td>406.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>60</td>\n",
" <td>'2020/12/06'</td>\n",
" <td>102</td>\n",
" <td>127</td>\n",
" <td>300.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>60</td>\n",
" <td>'2020/12/07'</td>\n",
" <td>110</td>\n",
" <td>136</td>\n",
" <td>374.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>450</td>\n",
" <td>'2020/12/08'</td>\n",
" <td>104</td>\n",
" <td>134</td>\n",
" <td>253.3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>30</td>\n",
" <td>'2020/12/09'</td>\n",
" <td>109</td>\n",
" <td>133</td>\n",
" <td>195.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>60</td>\n",
" <td>'2020/12/10'</td>\n",
" <td>98</td>\n",
" <td>124</td>\n",
" <td>269.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>60</td>\n",
" <td>'2020/12/11'</td>\n",
" <td>103</td>\n",
" <td>147</td>\n",
" <td>329.3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>60</td>\n",
" <td>'2020/12/12'</td>\n",
" <td>100</td>\n",
" <td>120</td>\n",
" <td>250.7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>60</td>\n",
" <td>'2020/12/12'</td>\n",
" <td>100</td>\n",
" <td>120</td>\n",
" <td>250.7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>60</td>\n",
" <td>'2020/12/13'</td>\n",
" <td>106</td>\n",
" <td>128</td>\n",
" <td>345.3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>60</td>\n",
" <td>'2020/12/14'</td>\n",
" <td>104</td>\n",
" <td>132</td>\n",
" <td>379.3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>60</td>\n",
" <td>'2020/12/15'</td>\n",
" <td>98</td>\n",
" <td>123</td>\n",
" <td>275.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>60</td>\n",
" <td>'2020/12/16'</td>\n",
" <td>98</td>\n",
" <td>120</td>\n",
" <td>215.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>60</td>\n",
" <td>'2020/12/17'</td>\n",
" <td>100</td>\n",
" <td>120</td>\n",
" <td>300.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>60</td>\n",
" <td>'2020/12/19'</td>\n",
" <td>103</td>\n",
" <td>123</td>\n",
" <td>323.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>45</td>\n",
" <td>'2020/12/20'</td>\n",
" <td>97</td>\n",
" <td>125</td>\n",
" <td>243.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>60</td>\n",
" <td>'2020/12/21'</td>\n",
" <td>108</td>\n",
" <td>131</td>\n",
" <td>364.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>60</td>\n",
" <td>'2020/12/23'</td>\n",
" <td>130</td>\n",
" <td>101</td>\n",
" <td>300.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>45</td>\n",
" <td>'2020/12/24'</td>\n",
" <td>105</td>\n",
" <td>132</td>\n",
" <td>246.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>60</td>\n",
" <td>'2020/12/25'</td>\n",
" <td>102</td>\n",
" <td>126</td>\n",
" <td>334.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>60</td>\n",
" <td>20201226</td>\n",
" <td>100</td>\n",
" <td>120</td>\n",
" <td>250.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>60</td>\n",
" <td>'2020/12/27'</td>\n",
" <td>92</td>\n",
" <td>118</td>\n",
" <td>241.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>60</td>\n",
" <td>'2020/12/29'</td>\n",
" <td>100</td>\n",
" <td>132</td>\n",
" <td>280.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>30</th>\n",
" <td>60</td>\n",
" <td>'2020/12/30'</td>\n",
" <td>102</td>\n",
" <td>129</td>\n",
" <td>380.3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>31</th>\n",
" <td>60</td>\n",
" <td>'2020/12/31'</td>\n",
" <td>92</td>\n",
" <td>115</td>\n",
" <td>243.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Duration Date Pulse Maxpulse Calories\n",
"0 60 '2020/12/01' 110 130 409.1\n",
"1 60 '2020/12/02' 117 145 479.0\n",
"2 60 '2020/12/03' 103 135 340.0\n",
"3 45 '2020/12/04' 109 175 282.4\n",
"4 45 '2020/12/05' 117 148 406.0\n",
"5 60 '2020/12/06' 102 127 300.0\n",
"6 60 '2020/12/07' 110 136 374.0\n",
"7 450 '2020/12/08' 104 134 253.3\n",
"8 30 '2020/12/09' 109 133 195.1\n",
"9 60 '2020/12/10' 98 124 269.0\n",
"10 60 '2020/12/11' 103 147 329.3\n",
"11 60 '2020/12/12' 100 120 250.7\n",
"12 60 '2020/12/12' 100 120 250.7\n",
"13 60 '2020/12/13' 106 128 345.3\n",
"14 60 '2020/12/14' 104 132 379.3\n",
"15 60 '2020/12/15' 98 123 275.0\n",
"16 60 '2020/12/16' 98 120 215.2\n",
"17 60 '2020/12/17' 100 120 300.0\n",
"19 60 '2020/12/19' 103 123 323.0\n",
"20 45 '2020/12/20' 97 125 243.0\n",
"21 60 '2020/12/21' 108 131 364.2\n",
"23 60 '2020/12/23' 130 101 300.0\n",
"24 45 '2020/12/24' 105 132 246.0\n",
"25 60 '2020/12/25' 102 126 334.5\n",
"26 60 20201226 100 120 250.0\n",
"27 60 '2020/12/27' 92 118 241.0\n",
"29 60 '2020/12/29' 100 132 280.0\n",
"30 60 '2020/12/30' 102 129 380.3\n",
"31 60 '2020/12/31' 92 115 243.0"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Keep only the rows without any missing value; df itself is left untouched.\n",
"new_df = df.dropna(axis=0, how='any')\n",
"new_df"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "37533007-2851-49da-8fca-2e9d3b74c406",
"metadata": {},
"outputs": [],
"source": [
"# hint: df.dropna(inplace = True) <- modifies the original df"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "e94f0608-1928-4dec-b28c-3f56d72b1867",
"metadata": {},
"outputs": [],
"source": [
"# fill missing values\n",
"# df.fillna(130, inplace = True)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "025cec14-2687-4ec5-9fa9-f10f1da927ea",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Duration</th>\n",
" <th>Date</th>\n",
" <th>Pulse</th>\n",
" <th>Maxpulse</th>\n",
" <th>Calories</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>60</td>\n",
" <td>'2020/12/01'</td>\n",
" <td>110</td>\n",
" <td>130</td>\n",
" <td>409.10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>60</td>\n",
" <td>'2020/12/02'</td>\n",
" <td>117</td>\n",
" <td>145</td>\n",
" <td>479.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>60</td>\n",
" <td>'2020/12/03'</td>\n",
" <td>103</td>\n",
" <td>135</td>\n",
" <td>340.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>45</td>\n",
" <td>'2020/12/04'</td>\n",
" <td>109</td>\n",
" <td>175</td>\n",
" <td>282.40</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>45</td>\n",
" <td>'2020/12/05'</td>\n",
" <td>117</td>\n",
" <td>148</td>\n",
" <td>406.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>60</td>\n",
" <td>'2020/12/06'</td>\n",
" <td>102</td>\n",
" <td>127</td>\n",
" <td>300.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>60</td>\n",
" <td>'2020/12/07'</td>\n",
" <td>110</td>\n",
" <td>136</td>\n",
" <td>374.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>450</td>\n",
" <td>'2020/12/08'</td>\n",
" <td>104</td>\n",
" <td>134</td>\n",
" <td>253.30</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>30</td>\n",
" <td>'2020/12/09'</td>\n",
" <td>109</td>\n",
" <td>133</td>\n",
" <td>195.10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>60</td>\n",
" <td>'2020/12/10'</td>\n",
" <td>98</td>\n",
" <td>124</td>\n",
" <td>269.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>60</td>\n",
" <td>'2020/12/11'</td>\n",
" <td>103</td>\n",
" <td>147</td>\n",
" <td>329.30</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>60</td>\n",
" <td>'2020/12/12'</td>\n",
" <td>100</td>\n",
" <td>120</td>\n",
" <td>250.70</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>60</td>\n",
" <td>'2020/12/12'</td>\n",
" <td>100</td>\n",
" <td>120</td>\n",
" <td>250.70</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>60</td>\n",
" <td>'2020/12/13'</td>\n",
" <td>106</td>\n",
" <td>128</td>\n",
" <td>345.30</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>60</td>\n",
" <td>'2020/12/14'</td>\n",
" <td>104</td>\n",
" <td>132</td>\n",
" <td>379.30</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>60</td>\n",
" <td>'2020/12/15'</td>\n",
" <td>98</td>\n",
" <td>123</td>\n",
" <td>275.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>60</td>\n",
" <td>'2020/12/16'</td>\n",
" <td>98</td>\n",
" <td>120</td>\n",
" <td>215.20</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>60</td>\n",
" <td>'2020/12/17'</td>\n",
" <td>100</td>\n",
" <td>120</td>\n",
" <td>300.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>45</td>\n",
" <td>'2020/12/18'</td>\n",
" <td>90</td>\n",
" <td>112</td>\n",
" <td>304.68</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>60</td>\n",
" <td>'2020/12/19'</td>\n",
" <td>103</td>\n",
" <td>123</td>\n",
" <td>323.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>45</td>\n",
" <td>'2020/12/20'</td>\n",
" <td>97</td>\n",
" <td>125</td>\n",
" <td>243.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>60</td>\n",
" <td>'2020/12/21'</td>\n",
" <td>108</td>\n",
" <td>131</td>\n",
" <td>364.20</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>45</td>\n",
" <td>NaN</td>\n",
" <td>100</td>\n",
" <td>119</td>\n",
" <td>282.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>60</td>\n",
" <td>'2020/12/23'</td>\n",
" <td>130</td>\n",
" <td>101</td>\n",
" <td>300.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>45</td>\n",
" <td>'2020/12/24'</td>\n",
" <td>105</td>\n",
" <td>132</td>\n",
" <td>246.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>60</td>\n",
" <td>'2020/12/25'</td>\n",
" <td>102</td>\n",
" <td>126</td>\n",
" <td>334.50</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>60</td>\n",
" <td>20201226</td>\n",
" <td>100</td>\n",
" <td>120</td>\n",
" <td>250.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>60</td>\n",
" <td>'2020/12/27'</td>\n",
" <td>92</td>\n",
" <td>118</td>\n",
" <td>241.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>60</td>\n",
" <td>'2020/12/28'</td>\n",
" <td>103</td>\n",
" <td>132</td>\n",
" <td>304.68</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>60</td>\n",
" <td>'2020/12/29'</td>\n",
" <td>100</td>\n",
" <td>132</td>\n",
" <td>280.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>30</th>\n",
" <td>60</td>\n",
" <td>'2020/12/30'</td>\n",
" <td>102</td>\n",
" <td>129</td>\n",
" <td>380.30</td>\n",
" </tr>\n",
" <tr>\n",
" <th>31</th>\n",
" <td>60</td>\n",
" <td>'2020/12/31'</td>\n",
" <td>92</td>\n",
" <td>115</td>\n",
" <td>243.00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Duration Date Pulse Maxpulse Calories\n",
"0 60 '2020/12/01' 110 130 409.10\n",
"1 60 '2020/12/02' 117 145 479.00\n",
"2 60 '2020/12/03' 103 135 340.00\n",
"3 45 '2020/12/04' 109 175 282.40\n",
"4 45 '2020/12/05' 117 148 406.00\n",
"5 60 '2020/12/06' 102 127 300.00\n",
"6 60 '2020/12/07' 110 136 374.00\n",
"7 450 '2020/12/08' 104 134 253.30\n",
"8 30 '2020/12/09' 109 133 195.10\n",
"9 60 '2020/12/10' 98 124 269.00\n",
"10 60 '2020/12/11' 103 147 329.30\n",
"11 60 '2020/12/12' 100 120 250.70\n",
"12 60 '2020/12/12' 100 120 250.70\n",
"13 60 '2020/12/13' 106 128 345.30\n",
"14 60 '2020/12/14' 104 132 379.30\n",
"15 60 '2020/12/15' 98 123 275.00\n",
"16 60 '2020/12/16' 98 120 215.20\n",
"17 60 '2020/12/17' 100 120 300.00\n",
"18 45 '2020/12/18' 90 112 304.68\n",
"19 60 '2020/12/19' 103 123 323.00\n",
"20 45 '2020/12/20' 97 125 243.00\n",
"21 60 '2020/12/21' 108 131 364.20\n",
"22 45 NaN 100 119 282.00\n",
"23 60 '2020/12/23' 130 101 300.00\n",
"24 45 '2020/12/24' 105 132 246.00\n",
"25 60 '2020/12/25' 102 126 334.50\n",
"26 60 20201226 100 120 250.00\n",
"27 60 '2020/12/27' 92 118 241.00\n",
"28 60 '2020/12/28' 103 132 304.68\n",
"29 60 '2020/12/29' 100 132 280.00\n",
"30 60 '2020/12/30' 102 129 380.30\n",
"31 60 '2020/12/31' 92 115 243.00"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Impute missing Calories with the column mean.\n",
"# The result is assigned back instead of calling fillna(inplace=True) on the\n",
"# df[\"Calories\"] selection: that chained in-place form raises a FutureWarning\n",
"# in pandas 2.2 and no longer updates df under Copy-on-Write.\n",
"x = df[\"Calories\"].mean()\n",
"\n",
"df[\"Calories\"] = df[\"Calories\"].fillna(x)\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "d2e87f3b-ef58-4128-b52f-799056e56de8",
"metadata": {},
"outputs": [],
"source": [
"# Alternative imputation: fill missing Calories with the column median.\n",
"# NOTE(review): when the mean-imputation cell above has already run, no NaN is left and this is a no-op.\n",
"# The result is assigned back instead of calling fillna(inplace=True) on the\n",
"# df[\"Calories\"] selection: that chained in-place form raises a FutureWarning\n",
"# in pandas 2.2 and no longer updates df under Copy-on-Write.\n",
"x = df[\"Calories\"].median()\n",
"\n",
"df[\"Calories\"] = df[\"Calories\"].fillna(x)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "c42df786-aa1b-4174-b436-566421f1683b",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Duration</th>\n",
" <th>Date</th>\n",
" <th>Pulse</th>\n",
" <th>Maxpulse</th>\n",
" <th>Calories</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>60</td>\n",
" <td>2020-12-01</td>\n",
" <td>110</td>\n",
" <td>130</td>\n",
" <td>409.10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>60</td>\n",
" <td>2020-12-02</td>\n",
" <td>117</td>\n",
" <td>145</td>\n",
" <td>479.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>60</td>\n",
" <td>2020-12-03</td>\n",
" <td>103</td>\n",
" <td>135</td>\n",
" <td>340.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>45</td>\n",
" <td>2020-12-04</td>\n",
" <td>109</td>\n",
" <td>175</td>\n",
" <td>282.40</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>45</td>\n",
" <td>2020-12-05</td>\n",
" <td>117</td>\n",
" <td>148</td>\n",
" <td>406.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>60</td>\n",
" <td>2020-12-06</td>\n",
" <td>102</td>\n",
" <td>127</td>\n",
" <td>300.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>60</td>\n",
" <td>2020-12-07</td>\n",
" <td>110</td>\n",
" <td>136</td>\n",
" <td>374.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>450</td>\n",
" <td>2020-12-08</td>\n",
" <td>104</td>\n",
" <td>134</td>\n",
" <td>253.30</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>30</td>\n",
" <td>2020-12-09</td>\n",
" <td>109</td>\n",
" <td>133</td>\n",
" <td>195.10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>60</td>\n",
" <td>2020-12-10</td>\n",
" <td>98</td>\n",
" <td>124</td>\n",
" <td>269.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>60</td>\n",
" <td>2020-12-11</td>\n",
" <td>103</td>\n",
" <td>147</td>\n",
" <td>329.30</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>60</td>\n",
" <td>2020-12-12</td>\n",
" <td>100</td>\n",
" <td>120</td>\n",
" <td>250.70</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>60</td>\n",
" <td>2020-12-12</td>\n",
" <td>100</td>\n",
" <td>120</td>\n",
" <td>250.70</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>60</td>\n",
" <td>2020-12-13</td>\n",
" <td>106</td>\n",
" <td>128</td>\n",
" <td>345.30</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>60</td>\n",
" <td>2020-12-14</td>\n",
" <td>104</td>\n",
" <td>132</td>\n",
" <td>379.30</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>60</td>\n",
" <td>2020-12-15</td>\n",
" <td>98</td>\n",
" <td>123</td>\n",
" <td>275.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>60</td>\n",
" <td>2020-12-16</td>\n",
" <td>98</td>\n",
" <td>120</td>\n",
" <td>215.20</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>60</td>\n",
" <td>2020-12-17</td>\n",
" <td>100</td>\n",
" <td>120</td>\n",
" <td>300.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>45</td>\n",
" <td>2020-12-18</td>\n",
" <td>90</td>\n",
" <td>112</td>\n",
" <td>304.68</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>60</td>\n",
" <td>2020-12-19</td>\n",
" <td>103</td>\n",
" <td>123</td>\n",
" <td>323.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>45</td>\n",
" <td>2020-12-20</td>\n",
" <td>97</td>\n",
" <td>125</td>\n",
" <td>243.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>60</td>\n",
" <td>2020-12-21</td>\n",
" <td>108</td>\n",
" <td>131</td>\n",
" <td>364.20</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>45</td>\n",
" <td>NaT</td>\n",
" <td>100</td>\n",
" <td>119</td>\n",
" <td>282.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>60</td>\n",
" <td>2020-12-23</td>\n",
" <td>130</td>\n",
" <td>101</td>\n",
" <td>300.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>45</td>\n",
" <td>2020-12-24</td>\n",
" <td>105</td>\n",
" <td>132</td>\n",
" <td>246.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>60</td>\n",
" <td>2020-12-25</td>\n",
" <td>102</td>\n",
" <td>126</td>\n",
" <td>334.50</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>60</td>\n",
" <td>2020-12-26</td>\n",
" <td>100</td>\n",
" <td>120</td>\n",
" <td>250.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>60</td>\n",
" <td>2020-12-27</td>\n",
" <td>92</td>\n",
" <td>118</td>\n",
" <td>241.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>60</td>\n",
" <td>2020-12-28</td>\n",
" <td>103</td>\n",
" <td>132</td>\n",
" <td>304.68</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>60</td>\n",
" <td>2020-12-29</td>\n",
" <td>100</td>\n",
" <td>132</td>\n",
" <td>280.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>30</th>\n",
" <td>60</td>\n",
" <td>2020-12-30</td>\n",
" <td>102</td>\n",
" <td>129</td>\n",
" <td>380.30</td>\n",
" </tr>\n",
" <tr>\n",
" <th>31</th>\n",
" <td>60</td>\n",
" <td>2020-12-31</td>\n",
" <td>92</td>\n",
" <td>115</td>\n",
" <td>243.00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Duration Date Pulse Maxpulse Calories\n",
"0 60 2020-12-01 110 130 409.10\n",
"1 60 2020-12-02 117 145 479.00\n",
"2 60 2020-12-03 103 135 340.00\n",
"3 45 2020-12-04 109 175 282.40\n",
"4 45 2020-12-05 117 148 406.00\n",
"5 60 2020-12-06 102 127 300.00\n",
"6 60 2020-12-07 110 136 374.00\n",
"7 450 2020-12-08 104 134 253.30\n",
"8 30 2020-12-09 109 133 195.10\n",
"9 60 2020-12-10 98 124 269.00\n",
"10 60 2020-12-11 103 147 329.30\n",
"11 60 2020-12-12 100 120 250.70\n",
"12 60 2020-12-12 100 120 250.70\n",
"13 60 2020-12-13 106 128 345.30\n",
"14 60 2020-12-14 104 132 379.30\n",
"15 60 2020-12-15 98 123 275.00\n",
"16 60 2020-12-16 98 120 215.20\n",
"17 60 2020-12-17 100 120 300.00\n",
"18 45 2020-12-18 90 112 304.68\n",
"19 60 2020-12-19 103 123 323.00\n",
"20 45 2020-12-20 97 125 243.00\n",
"21 60 2020-12-21 108 131 364.20\n",
"22 45 NaT 100 119 282.00\n",
"23 60 2020-12-23 130 101 300.00\n",
"24 45 2020-12-24 105 132 246.00\n",
"25 60 2020-12-25 102 126 334.50\n",
"26 60 2020-12-26 100 120 250.00\n",
"27 60 2020-12-27 92 118 241.00\n",
"28 60 2020-12-28 103 132 304.68\n",
"29 60 2020-12-29 100 132 280.00\n",
"30 60 2020-12-30 102 129 380.30\n",
"31 60 2020-12-31 92 115 243.00"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Parse the Date column into real datetimes; the missing entry (row 22) becomes NaT.\n",
"# NOTE(review): the column mixes quoted 'YYYY/MM/DD' strings with a bare 20201226 --\n",
"# newer pandas releases may require format='mixed' here; confirm against the installed version.\n",
"df['Date'] = df['Date'].pipe(pd.to_datetime)\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "6508edc2-f7f1-469b-a094-1b6c98a155e3",
"metadata": {},
"outputs": [],
"source": [
"# remove missing value according to a column\n",
"# df.dropna(subset=['Date'], inplace = True)"
]
},
{
"cell_type": "markdown",
"id": "725032e8-c03e-428e-a928-f5c2533a3446",
"metadata": {},
"source": [
"#### Fixing Wrong Data"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "3367d5c9-90f8-4fb1-9c2b-bae2bdaeb7bf",
"metadata": {},
"outputs": [],
"source": [
"# Row 7 holds Duration 450 while every other row is between 30 and 60; overwrite it with 45.\n",
"df.at[7, 'Duration'] = 45"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "1a9ce891-9275-4539-a23c-4826fb258c1d",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Duration</th>\n",
" <th>Date</th>\n",
" <th>Pulse</th>\n",
" <th>Maxpulse</th>\n",
" <th>Calories</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>60</td>\n",
" <td>2020-12-01</td>\n",
" <td>110</td>\n",
" <td>130</td>\n",
" <td>409.10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>60</td>\n",
" <td>2020-12-02</td>\n",
" <td>117</td>\n",
" <td>145</td>\n",
" <td>479.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>60</td>\n",
" <td>2020-12-03</td>\n",
" <td>103</td>\n",
" <td>135</td>\n",
" <td>340.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>45</td>\n",
" <td>2020-12-04</td>\n",
" <td>109</td>\n",
" <td>175</td>\n",
" <td>282.40</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>45</td>\n",
" <td>2020-12-05</td>\n",
" <td>117</td>\n",
" <td>148</td>\n",
" <td>406.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>60</td>\n",
" <td>2020-12-06</td>\n",
" <td>102</td>\n",
" <td>127</td>\n",
" <td>300.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>60</td>\n",
" <td>2020-12-07</td>\n",
" <td>110</td>\n",
" <td>136</td>\n",
" <td>374.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>45</td>\n",
" <td>2020-12-08</td>\n",
" <td>104</td>\n",
" <td>134</td>\n",
" <td>253.30</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>30</td>\n",
" <td>2020-12-09</td>\n",
" <td>109</td>\n",
" <td>133</td>\n",
" <td>195.10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>60</td>\n",
" <td>2020-12-10</td>\n",
" <td>98</td>\n",
" <td>124</td>\n",
" <td>269.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>60</td>\n",
" <td>2020-12-11</td>\n",
" <td>103</td>\n",
" <td>147</td>\n",
" <td>329.30</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>60</td>\n",
" <td>2020-12-12</td>\n",
" <td>100</td>\n",
" <td>120</td>\n",
" <td>250.70</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>60</td>\n",
" <td>2020-12-12</td>\n",
" <td>100</td>\n",
" <td>120</td>\n",
" <td>250.70</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>60</td>\n",
" <td>2020-12-13</td>\n",
" <td>106</td>\n",
" <td>128</td>\n",
" <td>345.30</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>60</td>\n",
" <td>2020-12-14</td>\n",
" <td>104</td>\n",
" <td>132</td>\n",
" <td>379.30</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>60</td>\n",
" <td>2020-12-15</td>\n",
" <td>98</td>\n",
" <td>123</td>\n",
" <td>275.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>60</td>\n",
" <td>2020-12-16</td>\n",
" <td>98</td>\n",
" <td>120</td>\n",
" <td>215.20</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>60</td>\n",
" <td>2020-12-17</td>\n",
" <td>100</td>\n",
" <td>120</td>\n",
" <td>300.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>45</td>\n",
" <td>2020-12-18</td>\n",
" <td>90</td>\n",
" <td>112</td>\n",
" <td>304.68</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>60</td>\n",
" <td>2020-12-19</td>\n",
" <td>103</td>\n",
" <td>123</td>\n",
" <td>323.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>45</td>\n",
" <td>2020-12-20</td>\n",
" <td>97</td>\n",
" <td>125</td>\n",
" <td>243.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>60</td>\n",
" <td>2020-12-21</td>\n",
" <td>108</td>\n",
" <td>131</td>\n",
" <td>364.20</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>45</td>\n",
" <td>NaT</td>\n",
" <td>100</td>\n",
" <td>119</td>\n",
" <td>282.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>60</td>\n",
" <td>2020-12-23</td>\n",
" <td>130</td>\n",
" <td>101</td>\n",
" <td>300.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>45</td>\n",
" <td>2020-12-24</td>\n",
" <td>105</td>\n",
" <td>132</td>\n",
" <td>246.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>60</td>\n",
" <td>2020-12-25</td>\n",
" <td>102</td>\n",
" <td>126</td>\n",
" <td>334.50</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>60</td>\n",
" <td>2020-12-26</td>\n",
" <td>100</td>\n",
" <td>120</td>\n",
" <td>250.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>60</td>\n",
" <td>2020-12-27</td>\n",
" <td>92</td>\n",
" <td>118</td>\n",
" <td>241.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>60</td>\n",
" <td>2020-12-28</td>\n",
" <td>103</td>\n",
" <td>132</td>\n",
" <td>304.68</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>60</td>\n",
" <td>2020-12-29</td>\n",
" <td>100</td>\n",
" <td>132</td>\n",
" <td>280.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>30</th>\n",
" <td>60</td>\n",
" <td>2020-12-30</td>\n",
" <td>102</td>\n",
" <td>129</td>\n",
" <td>380.30</td>\n",
" </tr>\n",
" <tr>\n",
" <th>31</th>\n",
" <td>60</td>\n",
" <td>2020-12-31</td>\n",
" <td>92</td>\n",
" <td>115</td>\n",
" <td>243.00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Duration Date Pulse Maxpulse Calories\n",
"0 60 2020-12-01 110 130 409.10\n",
"1 60 2020-12-02 117 145 479.00\n",
"2 60 2020-12-03 103 135 340.00\n",
"3 45 2020-12-04 109 175 282.40\n",
"4 45 2020-12-05 117 148 406.00\n",
"5 60 2020-12-06 102 127 300.00\n",
"6 60 2020-12-07 110 136 374.00\n",
"7 45 2020-12-08 104 134 253.30\n",
"8 30 2020-12-09 109 133 195.10\n",
"9 60 2020-12-10 98 124 269.00\n",
"10 60 2020-12-11 103 147 329.30\n",
"11 60 2020-12-12 100 120 250.70\n",
"12 60 2020-12-12 100 120 250.70\n",
"13 60 2020-12-13 106 128 345.30\n",
"14 60 2020-12-14 104 132 379.30\n",
"15 60 2020-12-15 98 123 275.00\n",
"16 60 2020-12-16 98 120 215.20\n",
"17 60 2020-12-17 100 120 300.00\n",
"18 45 2020-12-18 90 112 304.68\n",
"19 60 2020-12-19 103 123 323.00\n",
"20 45 2020-12-20 97 125 243.00\n",
"21 60 2020-12-21 108 131 364.20\n",
"22 45 NaT 100 119 282.00\n",
"23 60 2020-12-23 130 101 300.00\n",
"24 45 2020-12-24 105 132 246.00\n",
"25 60 2020-12-25 102 126 334.50\n",
"26 60 2020-12-26 100 120 250.00\n",
"27 60 2020-12-27 92 118 241.00\n",
"28 60 2020-12-28 103 132 304.68\n",
"29 60 2020-12-29 100 132 280.00\n",
"30 60 2020-12-30 102 129 380.30\n",
"31 60 2020-12-31 92 115 243.00"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "7888f644-60a5-41e2-bd9f-acf1f5e08f5d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 False\n",
"1 False\n",
"2 False\n",
"3 False\n",
"4 False\n",
"5 False\n",
"6 False\n",
"7 False\n",
"8 False\n",
"9 False\n",
"10 False\n",
"11 False\n",
"12 True\n",
"13 False\n",
"14 False\n",
"15 False\n",
"16 False\n",
"17 False\n",
"18 False\n",
"19 False\n",
"20 False\n",
"21 False\n",
"22 False\n",
"23 False\n",
"24 False\n",
"25 False\n",
"26 False\n",
"27 False\n",
"28 False\n",
"29 False\n",
"30 False\n",
"31 False\n",
"dtype: bool\n"
]
}
],
"source": [
"# Flag duplicate rows: row 12 repeats row 11, so duplicated() marks only row 12 as True\n",
"print(df.duplicated())"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "ff4ee9a2-dabb-4015-8b0c-5527f688bb21",
"metadata": {},
"outputs": [],
"source": [
"# Keep the first occurrence of every duplicated row (drops row 12, the copy of row 11).\n",
"# Reassigning instead of using inplace=True leaves the previous frame object untouched.\n",
"df = df.drop_duplicates()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "44165eb4-ab0c-4be0-92d6-4c8ccf2ff389",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Duration</th>\n",
" <th>Date</th>\n",
" <th>Pulse</th>\n",
" <th>Maxpulse</th>\n",
" <th>Calories</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>60</td>\n",
" <td>2020-12-01</td>\n",
" <td>110</td>\n",
" <td>130</td>\n",
" <td>409.10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>60</td>\n",
" <td>2020-12-02</td>\n",
" <td>117</td>\n",
" <td>145</td>\n",
" <td>479.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>60</td>\n",
" <td>2020-12-03</td>\n",
" <td>103</td>\n",
" <td>135</td>\n",
" <td>340.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>45</td>\n",
" <td>2020-12-04</td>\n",
" <td>109</td>\n",
" <td>175</td>\n",
" <td>282.40</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>45</td>\n",
" <td>2020-12-05</td>\n",
" <td>117</td>\n",
" <td>148</td>\n",
" <td>406.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>60</td>\n",
" <td>2020-12-06</td>\n",
" <td>102</td>\n",
" <td>127</td>\n",
" <td>300.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>60</td>\n",
" <td>2020-12-07</td>\n",
" <td>110</td>\n",
" <td>136</td>\n",
" <td>374.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>45</td>\n",
" <td>2020-12-08</td>\n",
" <td>104</td>\n",
" <td>134</td>\n",
" <td>253.30</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>30</td>\n",
" <td>2020-12-09</td>\n",
" <td>109</td>\n",
" <td>133</td>\n",
" <td>195.10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>60</td>\n",
" <td>2020-12-10</td>\n",
" <td>98</td>\n",
" <td>124</td>\n",
" <td>269.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>60</td>\n",
" <td>2020-12-11</td>\n",
" <td>103</td>\n",
" <td>147</td>\n",
" <td>329.30</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>60</td>\n",
" <td>2020-12-12</td>\n",
" <td>100</td>\n",
" <td>120</td>\n",
" <td>250.70</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>60</td>\n",
" <td>2020-12-13</td>\n",
" <td>106</td>\n",
" <td>128</td>\n",
" <td>345.30</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>60</td>\n",
" <td>2020-12-14</td>\n",
" <td>104</td>\n",
" <td>132</td>\n",
" <td>379.30</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>60</td>\n",
" <td>2020-12-15</td>\n",
" <td>98</td>\n",
" <td>123</td>\n",
" <td>275.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>60</td>\n",
" <td>2020-12-16</td>\n",
" <td>98</td>\n",
" <td>120</td>\n",
" <td>215.20</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>60</td>\n",
" <td>2020-12-17</td>\n",
" <td>100</td>\n",
" <td>120</td>\n",
" <td>300.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>45</td>\n",
" <td>2020-12-18</td>\n",
" <td>90</td>\n",
" <td>112</td>\n",
" <td>304.68</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>60</td>\n",
" <td>2020-12-19</td>\n",
" <td>103</td>\n",
" <td>123</td>\n",
" <td>323.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>45</td>\n",
" <td>2020-12-20</td>\n",
" <td>97</td>\n",
" <td>125</td>\n",
" <td>243.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>60</td>\n",
" <td>2020-12-21</td>\n",
" <td>108</td>\n",
" <td>131</td>\n",
" <td>364.20</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>45</td>\n",
" <td>NaT</td>\n",
" <td>100</td>\n",
" <td>119</td>\n",
" <td>282.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>60</td>\n",
" <td>2020-12-23</td>\n",
" <td>130</td>\n",
" <td>101</td>\n",
" <td>300.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>45</td>\n",
" <td>2020-12-24</td>\n",
" <td>105</td>\n",
" <td>132</td>\n",
" <td>246.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>60</td>\n",
" <td>2020-12-25</td>\n",
" <td>102</td>\n",
" <td>126</td>\n",
" <td>334.50</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>60</td>\n",
" <td>2020-12-26</td>\n",
" <td>100</td>\n",
" <td>120</td>\n",
" <td>250.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>60</td>\n",
" <td>2020-12-27</td>\n",
" <td>92</td>\n",
" <td>118</td>\n",
" <td>241.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>60</td>\n",
" <td>2020-12-28</td>\n",
" <td>103</td>\n",
" <td>132</td>\n",
" <td>304.68</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>60</td>\n",
" <td>2020-12-29</td>\n",
" <td>100</td>\n",
" <td>132</td>\n",
" <td>280.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>30</th>\n",
" <td>60</td>\n",
" <td>2020-12-30</td>\n",
" <td>102</td>\n",
" <td>129</td>\n",
" <td>380.30</td>\n",
" </tr>\n",
" <tr>\n",
" <th>31</th>\n",
" <td>60</td>\n",
" <td>2020-12-31</td>\n",
" <td>92</td>\n",
" <td>115</td>\n",
" <td>243.00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Duration Date Pulse Maxpulse Calories\n",
"0 60 2020-12-01 110 130 409.10\n",
"1 60 2020-12-02 117 145 479.00\n",
"2 60 2020-12-03 103 135 340.00\n",
"3 45 2020-12-04 109 175 282.40\n",
"4 45 2020-12-05 117 148 406.00\n",
"5 60 2020-12-06 102 127 300.00\n",
"6 60 2020-12-07 110 136 374.00\n",
"7 45 2020-12-08 104 134 253.30\n",
"8 30 2020-12-09 109 133 195.10\n",
"9 60 2020-12-10 98 124 269.00\n",
"10 60 2020-12-11 103 147 329.30\n",
"11 60 2020-12-12 100 120 250.70\n",
"13 60 2020-12-13 106 128 345.30\n",
"14 60 2020-12-14 104 132 379.30\n",
"15 60 2020-12-15 98 123 275.00\n",
"16 60 2020-12-16 98 120 215.20\n",
"17 60 2020-12-17 100 120 300.00\n",
"18 45 2020-12-18 90 112 304.68\n",
"19 60 2020-12-19 103 123 323.00\n",
"20 45 2020-12-20 97 125 243.00\n",
"21 60 2020-12-21 108 131 364.20\n",
"22 45 NaT 100 119 282.00\n",
"23 60 2020-12-23 130 101 300.00\n",
"24 45 2020-12-24 105 132 246.00\n",
"25 60 2020-12-25 102 126 334.50\n",
"26 60 2020-12-26 100 120 250.00\n",
"27 60 2020-12-27 92 118 241.00\n",
"28 60 2020-12-28 103 132 304.68\n",
"29 60 2020-12-29 100 132 280.00\n",
"30 60 2020-12-30 102 129 380.30\n",
"31 60 2020-12-31 92 115 243.00"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "3033c2a4-18f1-4fcd-be75-f71f95c9097f",
"metadata": {},
"outputs": [],
"source": [
"# Persist only the cleaned columns: index=False keeps the gapped row index (11 -> 13)\n",
"# from being written to the file as an extra unnamed first column.\n",
"df.to_csv('cleaned.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "549ea6b3-3903-4b74-88ad-74c60e7d862e",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}