{ "cells": [ { "cell_type": "markdown", "id": "b0c0ae08-2fb5-47f5-a5ce-1a66e35791a4", "metadata": {}, "source": [ "### Cleaning Data" ] }, { "cell_type": "code", "execution_count": 1, "id": "f9998a78-ae01-4531-b325-637b6d5ee86d", "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 2, "id": "9516a86a-ed6a-4f79-b631-3195daec258c", "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv('https://gist.githubusercontent.com/maltegrosse/bdfd2c6a5e3bff315d92cd27c2461a48/raw/49d5672953360934601b3d252c9b78121eed10db/data.csv')" ] }, { "cell_type": "code", "execution_count": 3, "id": "ea25a32c-70d3-479d-8d11-7e487f13f50c", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DurationDatePulseMaxpulseCalories
060'2020/12/01'110130409.1
160'2020/12/02'117145479.0
260'2020/12/03'103135340.0
345'2020/12/04'109175282.4
445'2020/12/05'117148406.0
560'2020/12/06'102127300.0
660'2020/12/07'110136374.0
7450'2020/12/08'104134253.3
830'2020/12/09'109133195.1
960'2020/12/10'98124269.0
1060'2020/12/11'103147329.3
1160'2020/12/12'100120250.7
1260'2020/12/12'100120250.7
1360'2020/12/13'106128345.3
1460'2020/12/14'104132379.3
1560'2020/12/15'98123275.0
1660'2020/12/16'98120215.2
1760'2020/12/17'100120300.0
1845'2020/12/18'90112NaN
1960'2020/12/19'103123323.0
2045'2020/12/20'97125243.0
2160'2020/12/21'108131364.2
2245NaN100119282.0
2360'2020/12/23'130101300.0
2445'2020/12/24'105132246.0
2560'2020/12/25'102126334.5
266020201226100120250.0
2760'2020/12/27'92118241.0
2860'2020/12/28'103132NaN
2960'2020/12/29'100132280.0
3060'2020/12/30'102129380.3
3160'2020/12/31'92115243.0
\n", "
" ], "text/plain": [ " Duration Date Pulse Maxpulse Calories\n", "0 60 '2020/12/01' 110 130 409.1\n", "1 60 '2020/12/02' 117 145 479.0\n", "2 60 '2020/12/03' 103 135 340.0\n", "3 45 '2020/12/04' 109 175 282.4\n", "4 45 '2020/12/05' 117 148 406.0\n", "5 60 '2020/12/06' 102 127 300.0\n", "6 60 '2020/12/07' 110 136 374.0\n", "7 450 '2020/12/08' 104 134 253.3\n", "8 30 '2020/12/09' 109 133 195.1\n", "9 60 '2020/12/10' 98 124 269.0\n", "10 60 '2020/12/11' 103 147 329.3\n", "11 60 '2020/12/12' 100 120 250.7\n", "12 60 '2020/12/12' 100 120 250.7\n", "13 60 '2020/12/13' 106 128 345.3\n", "14 60 '2020/12/14' 104 132 379.3\n", "15 60 '2020/12/15' 98 123 275.0\n", "16 60 '2020/12/16' 98 120 215.2\n", "17 60 '2020/12/17' 100 120 300.0\n", "18 45 '2020/12/18' 90 112 NaN\n", "19 60 '2020/12/19' 103 123 323.0\n", "20 45 '2020/12/20' 97 125 243.0\n", "21 60 '2020/12/21' 108 131 364.2\n", "22 45 NaN 100 119 282.0\n", "23 60 '2020/12/23' 130 101 300.0\n", "24 45 '2020/12/24' 105 132 246.0\n", "25 60 '2020/12/25' 102 126 334.5\n", "26 60 20201226 100 120 250.0\n", "27 60 '2020/12/27' 92 118 241.0\n", "28 60 '2020/12/28' 103 132 NaN\n", "29 60 '2020/12/29' 100 132 280.0\n", "30 60 '2020/12/30' 102 129 380.3\n", "31 60 '2020/12/31' 92 115 243.0" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 4, "id": "2baf29d8-cd8f-4dfd-931a-c413a995320e", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DurationDatePulseMaxpulseCalories
060'2020/12/01'110130409.1
160'2020/12/02'117145479.0
260'2020/12/03'103135340.0
345'2020/12/04'109175282.4
445'2020/12/05'117148406.0
560'2020/12/06'102127300.0
660'2020/12/07'110136374.0
7450'2020/12/08'104134253.3
830'2020/12/09'109133195.1
960'2020/12/10'98124269.0
1060'2020/12/11'103147329.3
1160'2020/12/12'100120250.7
1260'2020/12/12'100120250.7
1360'2020/12/13'106128345.3
1460'2020/12/14'104132379.3
1560'2020/12/15'98123275.0
1660'2020/12/16'98120215.2
1760'2020/12/17'100120300.0
1960'2020/12/19'103123323.0
2045'2020/12/20'97125243.0
2160'2020/12/21'108131364.2
2360'2020/12/23'130101300.0
2445'2020/12/24'105132246.0
2560'2020/12/25'102126334.5
266020201226100120250.0
2760'2020/12/27'92118241.0
2960'2020/12/29'100132280.0
3060'2020/12/30'102129380.3
3160'2020/12/31'92115243.0
\n", "
" ], "text/plain": [ " Duration Date Pulse Maxpulse Calories\n", "0 60 '2020/12/01' 110 130 409.1\n", "1 60 '2020/12/02' 117 145 479.0\n", "2 60 '2020/12/03' 103 135 340.0\n", "3 45 '2020/12/04' 109 175 282.4\n", "4 45 '2020/12/05' 117 148 406.0\n", "5 60 '2020/12/06' 102 127 300.0\n", "6 60 '2020/12/07' 110 136 374.0\n", "7 450 '2020/12/08' 104 134 253.3\n", "8 30 '2020/12/09' 109 133 195.1\n", "9 60 '2020/12/10' 98 124 269.0\n", "10 60 '2020/12/11' 103 147 329.3\n", "11 60 '2020/12/12' 100 120 250.7\n", "12 60 '2020/12/12' 100 120 250.7\n", "13 60 '2020/12/13' 106 128 345.3\n", "14 60 '2020/12/14' 104 132 379.3\n", "15 60 '2020/12/15' 98 123 275.0\n", "16 60 '2020/12/16' 98 120 215.2\n", "17 60 '2020/12/17' 100 120 300.0\n", "19 60 '2020/12/19' 103 123 323.0\n", "20 45 '2020/12/20' 97 125 243.0\n", "21 60 '2020/12/21' 108 131 364.2\n", "23 60 '2020/12/23' 130 101 300.0\n", "24 45 '2020/12/24' 105 132 246.0\n", "25 60 '2020/12/25' 102 126 334.5\n", "26 60 20201226 100 120 250.0\n", "27 60 '2020/12/27' 92 118 241.0\n", "29 60 '2020/12/29' 100 132 280.0\n", "30 60 '2020/12/30' 102 129 380.3\n", "31 60 '2020/12/31' 92 115 243.0" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# drop null/NaN\n", "new_df = df.dropna()\n", "new_df" ] }, { "cell_type": "code", "execution_count": 5, "id": "37533007-2851-49da-8fca-2e9d3b74c406", "metadata": {}, "outputs": [], "source": [ "# hint: df.dropna(inplace=True) modifies the original df in place instead of returning a new one" ] }, { "cell_type": "code", "execution_count": 6, "id": "e94f0608-1928-4dec-b28c-3f56d72b1867", "metadata": {}, "outputs": [], "source": [ "# fill missing values\n", "# df.fillna(130, inplace = True)" ] }, { "cell_type": "code", "execution_count": 9, "id": "025cec14-2687-4ec5-9fa9-f10f1da927ea", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DurationDatePulseMaxpulseCalories
060'2020/12/01'110130409.10
160'2020/12/02'117145479.00
260'2020/12/03'103135340.00
345'2020/12/04'109175282.40
445'2020/12/05'117148406.00
560'2020/12/06'102127300.00
660'2020/12/07'110136374.00
7450'2020/12/08'104134253.30
830'2020/12/09'109133195.10
960'2020/12/10'98124269.00
1060'2020/12/11'103147329.30
1160'2020/12/12'100120250.70
1260'2020/12/12'100120250.70
1360'2020/12/13'106128345.30
1460'2020/12/14'104132379.30
1560'2020/12/15'98123275.00
1660'2020/12/16'98120215.20
1760'2020/12/17'100120300.00
1845'2020/12/18'90112304.68
1960'2020/12/19'103123323.00
2045'2020/12/20'97125243.00
2160'2020/12/21'108131364.20
2245NaN100119282.00
2360'2020/12/23'130101300.00
2445'2020/12/24'105132246.00
2560'2020/12/25'102126334.50
266020201226100120250.00
2760'2020/12/27'92118241.00
2860'2020/12/28'103132304.68
2960'2020/12/29'100132280.00
3060'2020/12/30'102129380.30
3160'2020/12/31'92115243.00
\n", "
" ], "text/plain": [ " Duration Date Pulse Maxpulse Calories\n", "0 60 '2020/12/01' 110 130 409.10\n", "1 60 '2020/12/02' 117 145 479.00\n", "2 60 '2020/12/03' 103 135 340.00\n", "3 45 '2020/12/04' 109 175 282.40\n", "4 45 '2020/12/05' 117 148 406.00\n", "5 60 '2020/12/06' 102 127 300.00\n", "6 60 '2020/12/07' 110 136 374.00\n", "7 450 '2020/12/08' 104 134 253.30\n", "8 30 '2020/12/09' 109 133 195.10\n", "9 60 '2020/12/10' 98 124 269.00\n", "10 60 '2020/12/11' 103 147 329.30\n", "11 60 '2020/12/12' 100 120 250.70\n", "12 60 '2020/12/12' 100 120 250.70\n", "13 60 '2020/12/13' 106 128 345.30\n", "14 60 '2020/12/14' 104 132 379.30\n", "15 60 '2020/12/15' 98 123 275.00\n", "16 60 '2020/12/16' 98 120 215.20\n", "17 60 '2020/12/17' 100 120 300.00\n", "18 45 '2020/12/18' 90 112 304.68\n", "19 60 '2020/12/19' 103 123 323.00\n", "20 45 '2020/12/20' 97 125 243.00\n", "21 60 '2020/12/21' 108 131 364.20\n", "22 45 NaN 100 119 282.00\n", "23 60 '2020/12/23' 130 101 300.00\n", "24 45 '2020/12/24' 105 132 246.00\n", "25 60 '2020/12/25' 102 126 334.50\n", "26 60 20201226 100 120 250.00\n", "27 60 '2020/12/27' 92 118 241.00\n", "28 60 '2020/12/28' 103 132 304.68\n", "29 60 '2020/12/29' 100 132 280.00\n", "30 60 '2020/12/30' 102 129 380.30\n", "31 60 '2020/12/31' 92 115 243.00" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# fill missing Calories with the column mean\n", "x = df[\"Calories\"].mean()\n", "\n", "# assign the result back: fillna(inplace=True) on df[\"Calories\"] is a chained in-place call,\n", "# which is deprecated and leaves df unchanged under pandas copy-on-write\n", "df[\"Calories\"] = df[\"Calories\"].fillna(x)\n", "df" ] }, { "cell_type": "code", "execution_count": 10, "id": "d2e87f3b-ef58-4128-b52f-799056e56de8", "metadata": {}, "outputs": [], "source": [ "# alternative: fill missing Calories with the column median\n", "# (has no effect when run after the mean fill, because no NaN is left in Calories)\n", "x = df[\"Calories\"].median()\n", "\n", "df[\"Calories\"] = df[\"Calories\"].fillna(x)" ] }, { "cell_type": "code", "execution_count": 12, "id": "c42df786-aa1b-4174-b436-566421f1683b", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DurationDatePulseMaxpulseCalories
0602020-12-01110130409.10
1602020-12-02117145479.00
2602020-12-03103135340.00
3452020-12-04109175282.40
4452020-12-05117148406.00
5602020-12-06102127300.00
6602020-12-07110136374.00
74502020-12-08104134253.30
8302020-12-09109133195.10
9602020-12-1098124269.00
10602020-12-11103147329.30
11602020-12-12100120250.70
12602020-12-12100120250.70
13602020-12-13106128345.30
14602020-12-14104132379.30
15602020-12-1598123275.00
16602020-12-1698120215.20
17602020-12-17100120300.00
18452020-12-1890112304.68
19602020-12-19103123323.00
20452020-12-2097125243.00
21602020-12-21108131364.20
2245NaT100119282.00
23602020-12-23130101300.00
24452020-12-24105132246.00
25602020-12-25102126334.50
26602020-12-26100120250.00
27602020-12-2792118241.00
28602020-12-28103132304.68
29602020-12-29100132280.00
30602020-12-30102129380.30
31602020-12-3192115243.00
\n", "
" ], "text/plain": [ " Duration Date Pulse Maxpulse Calories\n", "0 60 2020-12-01 110 130 409.10\n", "1 60 2020-12-02 117 145 479.00\n", "2 60 2020-12-03 103 135 340.00\n", "3 45 2020-12-04 109 175 282.40\n", "4 45 2020-12-05 117 148 406.00\n", "5 60 2020-12-06 102 127 300.00\n", "6 60 2020-12-07 110 136 374.00\n", "7 450 2020-12-08 104 134 253.30\n", "8 30 2020-12-09 109 133 195.10\n", "9 60 2020-12-10 98 124 269.00\n", "10 60 2020-12-11 103 147 329.30\n", "11 60 2020-12-12 100 120 250.70\n", "12 60 2020-12-12 100 120 250.70\n", "13 60 2020-12-13 106 128 345.30\n", "14 60 2020-12-14 104 132 379.30\n", "15 60 2020-12-15 98 123 275.00\n", "16 60 2020-12-16 98 120 215.20\n", "17 60 2020-12-17 100 120 300.00\n", "18 45 2020-12-18 90 112 304.68\n", "19 60 2020-12-19 103 123 323.00\n", "20 45 2020-12-20 97 125 243.00\n", "21 60 2020-12-21 108 131 364.20\n", "22 45 NaT 100 119 282.00\n", "23 60 2020-12-23 130 101 300.00\n", "24 45 2020-12-24 105 132 246.00\n", "25 60 2020-12-25 102 126 334.50\n", "26 60 2020-12-26 100 120 250.00\n", "27 60 2020-12-27 92 118 241.00\n", "28 60 2020-12-28 103 132 304.68\n", "29 60 2020-12-29 100 132 280.00\n", "30 60 2020-12-30 102 129 380.30\n", "31 60 2020-12-31 92 115 243.00" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# convert into proper data type\n", "df['Date'] = pd.to_datetime(df['Date'])\n", "df" ] }, { "cell_type": "code", "execution_count": 13, "id": "6508edc2-f7f1-469b-a094-1b6c98a155e3", "metadata": {}, "outputs": [], "source": [ "# remove missing value according to a column\n", "# df.dropna(subset=['Date'], inplace = True)" ] }, { "cell_type": "markdown", "id": "725032e8-c03e-428e-a928-f5c2533a3446", "metadata": {}, "source": [ "#### Fixing Wrong Data" ] }, { "cell_type": "code", "execution_count": 15, "id": "3367d5c9-90f8-4fb1-9c2b-bae2bdaeb7bf", "metadata": {}, "outputs": [], "source": [ "# row 7: 450 duration!\n", "df.loc[7, 'Duration'] = 45" ] }, { "cell_type": "code", 
"execution_count": 16, "id": "1a9ce891-9275-4539-a23c-4826fb258c1d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DurationDatePulseMaxpulseCalories
0602020-12-01110130409.10
1602020-12-02117145479.00
2602020-12-03103135340.00
3452020-12-04109175282.40
4452020-12-05117148406.00
5602020-12-06102127300.00
6602020-12-07110136374.00
7452020-12-08104134253.30
8302020-12-09109133195.10
9602020-12-1098124269.00
10602020-12-11103147329.30
11602020-12-12100120250.70
12602020-12-12100120250.70
13602020-12-13106128345.30
14602020-12-14104132379.30
15602020-12-1598123275.00
16602020-12-1698120215.20
17602020-12-17100120300.00
18452020-12-1890112304.68
19602020-12-19103123323.00
20452020-12-2097125243.00
21602020-12-21108131364.20
2245NaT100119282.00
23602020-12-23130101300.00
24452020-12-24105132246.00
25602020-12-25102126334.50
26602020-12-26100120250.00
27602020-12-2792118241.00
28602020-12-28103132304.68
29602020-12-29100132280.00
30602020-12-30102129380.30
31602020-12-3192115243.00
\n", "
" ], "text/plain": [ " Duration Date Pulse Maxpulse Calories\n", "0 60 2020-12-01 110 130 409.10\n", "1 60 2020-12-02 117 145 479.00\n", "2 60 2020-12-03 103 135 340.00\n", "3 45 2020-12-04 109 175 282.40\n", "4 45 2020-12-05 117 148 406.00\n", "5 60 2020-12-06 102 127 300.00\n", "6 60 2020-12-07 110 136 374.00\n", "7 45 2020-12-08 104 134 253.30\n", "8 30 2020-12-09 109 133 195.10\n", "9 60 2020-12-10 98 124 269.00\n", "10 60 2020-12-11 103 147 329.30\n", "11 60 2020-12-12 100 120 250.70\n", "12 60 2020-12-12 100 120 250.70\n", "13 60 2020-12-13 106 128 345.30\n", "14 60 2020-12-14 104 132 379.30\n", "15 60 2020-12-15 98 123 275.00\n", "16 60 2020-12-16 98 120 215.20\n", "17 60 2020-12-17 100 120 300.00\n", "18 45 2020-12-18 90 112 304.68\n", "19 60 2020-12-19 103 123 323.00\n", "20 45 2020-12-20 97 125 243.00\n", "21 60 2020-12-21 108 131 364.20\n", "22 45 NaT 100 119 282.00\n", "23 60 2020-12-23 130 101 300.00\n", "24 45 2020-12-24 105 132 246.00\n", "25 60 2020-12-25 102 126 334.50\n", "26 60 2020-12-26 100 120 250.00\n", "27 60 2020-12-27 92 118 241.00\n", "28 60 2020-12-28 103 132 304.68\n", "29 60 2020-12-29 100 132 280.00\n", "30 60 2020-12-30 102 129 380.30\n", "31 60 2020-12-31 92 115 243.00" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 18, "id": "7888f644-60a5-41e2-bd9f-acf1f5e08f5d", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 False\n", "1 False\n", "2 False\n", "3 False\n", "4 False\n", "5 False\n", "6 False\n", "7 False\n", "8 False\n", "9 False\n", "10 False\n", "11 False\n", "12 True\n", "13 False\n", "14 False\n", "15 False\n", "16 False\n", "17 False\n", "18 False\n", "19 False\n", "20 False\n", "21 False\n", "22 False\n", "23 False\n", "24 False\n", "25 False\n", "26 False\n", "27 False\n", "28 False\n", "29 False\n", "30 False\n", "31 False\n", "dtype: bool\n" ] } ], "source": [ "# remove duplicates row 
11 & 12\n", "print(df.duplicated())" ] }, { "cell_type": "code", "execution_count": 19, "id": "ff4ee9a2-dabb-4015-8b0c-5527f688bb21", "metadata": {}, "outputs": [], "source": [ "df.drop_duplicates(inplace = True)" ] }, { "cell_type": "code", "execution_count": 20, "id": "44165eb4-ab0c-4be0-92d6-4c8ccf2ff389", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DurationDatePulseMaxpulseCalories
0602020-12-01110130409.10
1602020-12-02117145479.00
2602020-12-03103135340.00
3452020-12-04109175282.40
4452020-12-05117148406.00
5602020-12-06102127300.00
6602020-12-07110136374.00
7452020-12-08104134253.30
8302020-12-09109133195.10
9602020-12-1098124269.00
10602020-12-11103147329.30
11602020-12-12100120250.70
13602020-12-13106128345.30
14602020-12-14104132379.30
15602020-12-1598123275.00
16602020-12-1698120215.20
17602020-12-17100120300.00
18452020-12-1890112304.68
19602020-12-19103123323.00
20452020-12-2097125243.00
21602020-12-21108131364.20
2245NaT100119282.00
23602020-12-23130101300.00
24452020-12-24105132246.00
25602020-12-25102126334.50
26602020-12-26100120250.00
27602020-12-2792118241.00
28602020-12-28103132304.68
29602020-12-29100132280.00
30602020-12-30102129380.30
31602020-12-3192115243.00
\n", "
" ], "text/plain": [ " Duration Date Pulse Maxpulse Calories\n", "0 60 2020-12-01 110 130 409.10\n", "1 60 2020-12-02 117 145 479.00\n", "2 60 2020-12-03 103 135 340.00\n", "3 45 2020-12-04 109 175 282.40\n", "4 45 2020-12-05 117 148 406.00\n", "5 60 2020-12-06 102 127 300.00\n", "6 60 2020-12-07 110 136 374.00\n", "7 45 2020-12-08 104 134 253.30\n", "8 30 2020-12-09 109 133 195.10\n", "9 60 2020-12-10 98 124 269.00\n", "10 60 2020-12-11 103 147 329.30\n", "11 60 2020-12-12 100 120 250.70\n", "13 60 2020-12-13 106 128 345.30\n", "14 60 2020-12-14 104 132 379.30\n", "15 60 2020-12-15 98 123 275.00\n", "16 60 2020-12-16 98 120 215.20\n", "17 60 2020-12-17 100 120 300.00\n", "18 45 2020-12-18 90 112 304.68\n", "19 60 2020-12-19 103 123 323.00\n", "20 45 2020-12-20 97 125 243.00\n", "21 60 2020-12-21 108 131 364.20\n", "22 45 NaT 100 119 282.00\n", "23 60 2020-12-23 130 101 300.00\n", "24 45 2020-12-24 105 132 246.00\n", "25 60 2020-12-25 102 126 334.50\n", "26 60 2020-12-26 100 120 250.00\n", "27 60 2020-12-27 92 118 241.00\n", "28 60 2020-12-28 103 132 304.68\n", "29 60 2020-12-29 100 132 280.00\n", "30 60 2020-12-30 102 129 380.30\n", "31 60 2020-12-31 92 115 243.00" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 22, "id": "3033c2a4-18f1-4fcd-be75-f71f95c9097f", "metadata": {}, "outputs": [], "source": [ "df.to_csv('cleaned.csv')" ] }, { "cell_type": "code", "execution_count": null, "id": "549ea6b3-3903-4b74-88ad-74c60e7d862e", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.8" } }, "nbformat": 4, "nbformat_minor": 5 }