In [1]:
#import and alias

import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
#set the seed - numpy
np.random.seed(1)

#style sheet - matplotlib.pyplot
plt.rcParams['figure.figsize'] = [12.0, 6.0]
plt.rcParams['figure.dpi'] = 80
plt.style.use('seaborn-darkgrid')

#context (style) - seaborn
sns.set_context("notebook")
In [3]:
#COVID-19/archived_data/archived_time_series
confirmed_df = pd.read_csv('./csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv')
deaths_df = pd.read_csv('./csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv')
recovered_df = pd.read_csv('./csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv')
In [4]:
confirmed_df
Out[4]:
Province/State Country/Region Lat Long 1/22/20 1/23/20 1/24/20 1/25/20 1/26/20 1/27/20 ... 3/24/20 3/25/20 3/26/20 3/27/20 3/28/20 3/29/20 3/30/20 3/31/20 4/1/20 4/2/20
0 NaN Afghanistan 33.000000 65.000000 0 0 0 0 0 0 ... 74 84 94 110 110 120 170 174 237 273
1 NaN Albania 41.153300 20.168300 0 0 0 0 0 0 ... 123 146 174 186 197 212 223 243 259 277
2 NaN Algeria 28.033900 1.659600 0 0 0 0 0 0 ... 264 302 367 409 454 511 584 716 847 986
3 NaN Andorra 42.506300 1.521800 0 0 0 0 0 0 ... 164 188 224 267 308 334 370 376 390 428
4 NaN Angola -11.202700 17.873900 0 0 0 0 0 0 ... 3 3 4 4 5 7 7 7 8 8
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
253 NaN Botswana -22.328500 24.684900 0 0 0 0 0 0 ... 0 0 0 0 0 0 3 4 4 4
254 NaN Burundi -3.373100 29.918900 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 2 2 3
255 NaN Sierra Leone 8.460555 -11.779889 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 2 2
256 Bonaire, Sint Eustatius and Saba Netherlands 12.178400 -68.238500 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 2
257 NaN Malawi -13.254308 34.301525 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 3

258 rows × 76 columns

In [5]:
deaths_df
Out[5]:
Province/State Country/Region Lat Long 1/22/20 1/23/20 1/24/20 1/25/20 1/26/20 1/27/20 ... 3/24/20 3/25/20 3/26/20 3/27/20 3/28/20 3/29/20 3/30/20 3/31/20 4/1/20 4/2/20
0 NaN Afghanistan 33.000000 65.000000 0 0 0 0 0 0 ... 1 2 4 4 4 4 4 4 4 6
1 NaN Albania 41.153300 20.168300 0 0 0 0 0 0 ... 5 5 6 8 10 10 11 15 15 16
2 NaN Algeria 28.033900 1.659600 0 0 0 0 0 0 ... 19 21 25 26 29 31 35 44 58 86
3 NaN Andorra 42.506300 1.521800 0 0 0 0 0 0 ... 1 1 3 3 3 6 8 12 14 15
4 NaN Angola -11.202700 17.873900 0 0 0 0 0 0 ... 0 0 0 0 0 2 2 2 2 2
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
253 NaN Botswana -22.328500 24.684900 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 1 1
254 NaN Burundi -3.373100 29.918900 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
255 NaN Sierra Leone 8.460555 -11.779889 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
256 Bonaire, Sint Eustatius and Saba Netherlands 12.178400 -68.238500 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
257 NaN Malawi -13.254308 34.301525 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

258 rows × 76 columns

In [6]:
recovered_df
Out[6]:
Province/State Country/Region Lat Long 1/22/20 1/23/20 1/24/20 1/25/20 1/26/20 1/27/20 ... 3/24/20 3/25/20 3/26/20 3/27/20 3/28/20 3/29/20 3/30/20 3/31/20 4/1/20 4/2/20
0 NaN Afghanistan 33.000000 65.000000 0 0 0 0 0 0 ... 1 2 2 2 2 2 2 5 5 10
1 NaN Albania 41.153300 20.168300 0 0 0 0 0 0 ... 10 17 17 31 31 33 44 52 67 76
2 NaN Algeria 28.033900 1.659600 0 0 0 0 0 0 ... 24 65 29 29 31 31 37 46 61 61
3 NaN Andorra 42.506300 1.521800 0 0 0 0 0 0 ... 1 1 1 1 1 1 10 10 10 10
4 NaN Angola -11.202700 17.873900 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 1 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
239 NaN Botswana -22.328500 24.684900 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
240 NaN Burundi -3.373100 29.918900 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
241 NaN Sierra Leone 8.460555 -11.779889 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
242 Bonaire, Sint Eustatius and Saba Netherlands 12.178400 -68.238500 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
243 NaN Malawi -13.254308 34.301525 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

244 rows × 76 columns

In [7]:
confirmed_data_df = confirmed_df.melt(id_vars=['Province/State', 'Country/Region', 'Lat', 'Long'], var_name="Date", value_name="Confirmed")
death_data_df = deaths_df.melt(id_vars=['Province/State', 'Country/Region', 'Lat', 'Long'], var_name="Date", value_name="Deaths")
recovered_data_df = recovered_df.melt(id_vars=['Province/State', 'Country/Region', 'Lat', 'Long'], var_name="Date",  value_name="Recovered")
In [8]:
confirmed_data_df
Out[8]:
Province/State Country/Region Lat Long Date Confirmed
0 NaN Afghanistan 33.000000 65.000000 1/22/20 0
1 NaN Albania 41.153300 20.168300 1/22/20 0
2 NaN Algeria 28.033900 1.659600 1/22/20 0
3 NaN Andorra 42.506300 1.521800 1/22/20 0
4 NaN Angola -11.202700 17.873900 1/22/20 0
... ... ... ... ... ... ...
18571 NaN Botswana -22.328500 24.684900 4/2/20 4
18572 NaN Burundi -3.373100 29.918900 4/2/20 3
18573 NaN Sierra Leone 8.460555 -11.779889 4/2/20 2
18574 Bonaire, Sint Eustatius and Saba Netherlands 12.178400 -68.238500 4/2/20 2
18575 NaN Malawi -13.254308 34.301525 4/2/20 3

18576 rows × 6 columns

In [9]:
death_data_df
Out[9]:
Province/State Country/Region Lat Long Date Deaths
0 NaN Afghanistan 33.000000 65.000000 1/22/20 0
1 NaN Albania 41.153300 20.168300 1/22/20 0
2 NaN Algeria 28.033900 1.659600 1/22/20 0
3 NaN Andorra 42.506300 1.521800 1/22/20 0
4 NaN Angola -11.202700 17.873900 1/22/20 0
... ... ... ... ... ... ...
18571 NaN Botswana -22.328500 24.684900 4/2/20 1
18572 NaN Burundi -3.373100 29.918900 4/2/20 0
18573 NaN Sierra Leone 8.460555 -11.779889 4/2/20 0
18574 Bonaire, Sint Eustatius and Saba Netherlands 12.178400 -68.238500 4/2/20 0
18575 NaN Malawi -13.254308 34.301525 4/2/20 0

18576 rows × 6 columns

In [10]:
recovered_data_df
Out[10]:
Province/State Country/Region Lat Long Date Recovered
0 NaN Afghanistan 33.000000 65.000000 1/22/20 0
1 NaN Albania 41.153300 20.168300 1/22/20 0
2 NaN Algeria 28.033900 1.659600 1/22/20 0
3 NaN Andorra 42.506300 1.521800 1/22/20 0
4 NaN Angola -11.202700 17.873900 1/22/20 0
... ... ... ... ... ... ...
17563 NaN Botswana -22.328500 24.684900 4/2/20 0
17564 NaN Burundi -3.373100 29.918900 4/2/20 0
17565 NaN Sierra Leone 8.460555 -11.779889 4/2/20 0
17566 Bonaire, Sint Eustatius and Saba Netherlands 12.178400 -68.238500 4/2/20 0
17567 NaN Malawi -13.254308 34.301525 4/2/20 0

17568 rows × 6 columns

In [11]:
confirmed_data_df[confirmed_data_df["Confirmed"].isna()]
Out[11]:
Province/State Country/Region Lat Long Date Confirmed
In [12]:
death_data_df[death_data_df['Deaths'].isna()]
Out[12]:
Province/State Country/Region Lat Long Date Deaths
In [13]:
recovered_data_df[recovered_data_df['Recovered'].isna()]
Out[13]:
Province/State Country/Region Lat Long Date Recovered
In [14]:
all_data_df = pd.concat([confirmed_data_df, death_data_df['Deaths'] , recovered_data_df['Recovered']], axis=1).reset_index().drop(['index'], axis=1)
In [15]:
all_data_df
Out[15]:
Province/State Country/Region Lat Long Date Confirmed Deaths Recovered
0 NaN Afghanistan 33.000000 65.000000 1/22/20 0 0 0.0
1 NaN Albania 41.153300 20.168300 1/22/20 0 0 0.0
2 NaN Algeria 28.033900 1.659600 1/22/20 0 0 0.0
3 NaN Andorra 42.506300 1.521800 1/22/20 0 0 0.0
4 NaN Angola -11.202700 17.873900 1/22/20 0 0 0.0
... ... ... ... ... ... ... ... ...
18571 NaN Botswana -22.328500 24.684900 4/2/20 4 1 NaN
18572 NaN Burundi -3.373100 29.918900 4/2/20 3 0 NaN
18573 NaN Sierra Leone 8.460555 -11.779889 4/2/20 2 0 NaN
18574 Bonaire, Sint Eustatius and Saba Netherlands 12.178400 -68.238500 4/2/20 2 0 NaN
18575 NaN Malawi -13.254308 34.301525 4/2/20 3 0 NaN

18576 rows × 8 columns

In [16]:
all_data_df[all_data_df["Province/State"].isna()]
Out[16]:
Province/State Country/Region Lat Long Date Confirmed Deaths Recovered
0 NaN Afghanistan 33.000000 65.000000 1/22/20 0 0 0.0
1 NaN Albania 41.153300 20.168300 1/22/20 0 0 0.0
2 NaN Algeria 28.033900 1.659600 1/22/20 0 0 0.0
3 NaN Andorra 42.506300 1.521800 1/22/20 0 0 0.0
4 NaN Angola -11.202700 17.873900 1/22/20 0 0 0.0
... ... ... ... ... ... ... ... ...
18570 NaN MS Zaandam 0.000000 0.000000 4/2/20 9 2 NaN
18571 NaN Botswana -22.328500 24.684900 4/2/20 4 1 NaN
18572 NaN Burundi -3.373100 29.918900 4/2/20 3 0 NaN
18573 NaN Sierra Leone 8.460555 -11.779889 4/2/20 2 0 NaN
18575 NaN Malawi -13.254308 34.301525 4/2/20 3 0 NaN

12816 rows × 8 columns

In [17]:
all_data_df[all_data_df["Province/State"].isna()].head(50)
Out[17]:
Province/State Country/Region Lat Long Date Confirmed Deaths Recovered
0 NaN Afghanistan 33.0000 65.0000 1/22/20 0 0 0.0
1 NaN Albania 41.1533 20.1683 1/22/20 0 0 0.0
2 NaN Algeria 28.0339 1.6596 1/22/20 0 0 0.0
3 NaN Andorra 42.5063 1.5218 1/22/20 0 0 0.0
4 NaN Angola -11.2027 17.8739 1/22/20 0 0 0.0
5 NaN Antigua and Barbuda 17.0608 -61.7964 1/22/20 0 0 0.0
6 NaN Argentina -38.4161 -63.6167 1/22/20 0 0 0.0
7 NaN Armenia 40.0691 45.0382 1/22/20 0 0 0.0
16 NaN Austria 47.5162 14.5501 1/22/20 0 0 0.0
17 NaN Azerbaijan 40.1431 47.5769 1/22/20 0 0 0.0
18 NaN Bahamas 25.0343 -77.3963 1/22/20 0 0 0.0
19 NaN Bahrain 26.0275 50.5500 1/22/20 0 0 0.0
20 NaN Bangladesh 23.6850 90.3563 1/22/20 0 0 0.0
21 NaN Barbados 13.1939 -59.5432 1/22/20 0 0 0.0
22 NaN Belarus 53.7098 27.9534 1/22/20 0 0 0.0
23 NaN Belgium 50.8333 4.0000 1/22/20 0 0 0.0
24 NaN Benin 9.3077 2.3158 1/22/20 0 0 0.0
25 NaN Bhutan 27.5142 90.4336 1/22/20 0 0 0.0
26 NaN Bolivia -16.2902 -63.5887 1/22/20 0 0 0.0
27 NaN Bosnia and Herzegovina 43.9159 17.6791 1/22/20 0 0 0.0
28 NaN Brazil -14.2350 -51.9253 1/22/20 0 0 0.0
29 NaN Brunei 4.5353 114.7277 1/22/20 0 0 0.0
30 NaN Bulgaria 42.7339 25.4858 1/22/20 0 0 0.0
31 NaN Burkina Faso 12.2383 -1.5616 1/22/20 0 0 0.0
32 NaN Cabo Verde 16.5388 -23.0418 1/22/20 0 0 0.0
33 NaN Cambodia 11.5500 104.9167 1/22/20 0 0 0.0
34 NaN Cameroon 3.8480 11.5021 1/22/20 0 0 0.0
46 NaN Central African Republic 6.6111 20.9394 1/22/20 0 0 0.0
47 NaN Chad 15.4542 18.7322 1/22/20 0 0 0.0
48 NaN Chile -35.6751 -71.5430 1/22/20 0 0 0.0
82 NaN Colombia 4.5709 -74.2973 1/22/20 0 0 0.0
83 NaN Congo (Brazzaville) -4.0383 21.7587 1/22/20 0 0 0.0
84 NaN Congo (Kinshasa) -4.0383 21.7587 1/22/20 0 0 0.0
85 NaN Costa Rica 9.7489 -83.7534 1/22/20 0 0 0.0
86 NaN Cote d'Ivoire 7.5400 -5.5471 1/22/20 0 0 0.0
87 NaN Croatia 45.1000 15.2000 1/22/20 0 0 0.0
88 NaN Diamond Princess 0.0000 0.0000 1/22/20 0 0 0.0
89 NaN Cuba 22.0000 -80.0000 1/22/20 0 0 0.0
90 NaN Cyprus 35.1264 33.4299 1/22/20 0 0 0.0
91 NaN Czechia 49.8175 15.4730 1/22/20 0 0 0.0
94 NaN Denmark 56.2639 9.5018 1/22/20 0 0 0.0
95 NaN Djibouti 11.8251 42.5903 1/22/20 0 0 0.0
96 NaN Dominican Republic 18.7357 -70.1627 1/22/20 0 0 0.0
97 NaN Ecuador -1.8312 -78.1834 1/22/20 0 0 0.0
98 NaN Egypt 26.0000 30.0000 1/22/20 0 0 0.0
99 NaN El Salvador 13.7942 -88.8965 1/22/20 0 0 0.0
100 NaN Equatorial Guinea 1.5000 10.0000 1/22/20 0 0 0.0
101 NaN Eritrea 15.1794 39.7823 1/22/20 0 0 0.0
102 NaN Estonia 58.5953 25.0136 1/22/20 0 0 0.0
103 NaN Eswatini -26.5225 31.4659 1/22/20 0 0 0.0
In [18]:
all_data_df[all_data_df["Country/Region"].isna()]
Out[18]:
Province/State Country/Region Lat Long Date Confirmed Deaths Recovered
In [19]:
all_data_df[all_data_df["Confirmed"].isna()]
Out[19]:
Province/State Country/Region Lat Long Date Confirmed Deaths Recovered
In [20]:
all_data_df[all_data_df["Deaths"].isna()]
Out[20]:
Province/State Country/Region Lat Long Date Confirmed Deaths Recovered
In [21]:
all_data_df[all_data_df["Recovered"].isna()]
Out[21]:
Province/State Country/Region Lat Long Date Confirmed Deaths Recovered
17568 NaN Benin 9.307700 2.315800 3/30/20 6 0 NaN
17569 NaN Bhutan 27.514200 90.433600 3/30/20 4 0 NaN
17570 NaN Bolivia -16.290200 -63.588700 3/30/20 97 4 NaN
17571 NaN Bosnia and Herzegovina 43.915900 17.679100 3/30/20 368 10 NaN
17572 NaN Brazil -14.235000 -51.925300 3/30/20 4579 159 NaN
... ... ... ... ... ... ... ... ...
18571 NaN Botswana -22.328500 24.684900 4/2/20 4 1 NaN
18572 NaN Burundi -3.373100 29.918900 4/2/20 3 0 NaN
18573 NaN Sierra Leone 8.460555 -11.779889 4/2/20 2 0 NaN
18574 Bonaire, Sint Eustatius and Saba Netherlands 12.178400 -68.238500 4/2/20 2 0 NaN
18575 NaN Malawi -13.254308 34.301525 4/2/20 3 0 NaN

1008 rows × 8 columns

In [22]:
all_data_df[all_data_df["Recovered"].isna()].head(50)
Out[22]:
Province/State Country/Region Lat Long Date Confirmed Deaths Recovered
17568 NaN Benin 9.3077 2.3158 3/30/20 6 0 NaN
17569 NaN Bhutan 27.5142 90.4336 3/30/20 4 0 NaN
17570 NaN Bolivia -16.2902 -63.5887 3/30/20 97 4 NaN
17571 NaN Bosnia and Herzegovina 43.9159 17.6791 3/30/20 368 10 NaN
17572 NaN Brazil -14.2350 -51.9253 3/30/20 4579 159 NaN
17573 NaN Brunei 4.5353 114.7277 3/30/20 127 1 NaN
17574 NaN Bulgaria 42.7339 25.4858 3/30/20 359 8 NaN
17575 NaN Burkina Faso 12.2383 -1.5616 3/30/20 246 12 NaN
17576 NaN Cabo Verde 16.5388 -23.0418 3/30/20 6 1 NaN
17577 NaN Cambodia 11.5500 104.9167 3/30/20 107 0 NaN
17578 NaN Cameroon 3.8480 11.5021 3/30/20 139 6 NaN
17579 Alberta Canada 53.9333 -116.5765 3/30/20 661 3 NaN
17580 British Columbia Canada 49.2827 -123.1207 3/30/20 970 19 NaN
17581 Grand Princess Canada 37.6489 -122.6655 3/30/20 13 0 NaN
17582 Manitoba Canada 53.7609 -98.8139 3/30/20 96 1 NaN
17583 New Brunswick Canada 46.5653 -66.4619 3/30/20 68 0 NaN
17584 Newfoundland and Labrador Canada 53.1355 -57.6604 3/30/20 148 1 NaN
17585 Nova Scotia Canada 44.6820 -63.7443 3/30/20 127 0 NaN
17586 Ontario Canada 51.2538 -85.3232 3/30/20 1706 31 NaN
17587 Prince Edward Island Canada 46.5107 -63.4168 3/30/20 18 0 NaN
17588 Quebec Canada 52.9399 -73.5491 3/30/20 3430 22 NaN
17589 Saskatchewan Canada 52.9399 -106.4509 3/30/20 156 2 NaN
17590 NaN Central African Republic 6.6111 20.9394 3/30/20 3 0 NaN
17591 NaN Chad 15.4542 18.7322 3/30/20 5 0 NaN
17592 NaN Chile -35.6751 -71.5430 3/30/20 2449 8 NaN
17593 Anhui China 31.8257 117.2264 3/30/20 990 6 NaN
17594 Beijing China 40.1824 116.4142 3/30/20 577 8 NaN
17595 Chongqing China 30.0572 107.8740 3/30/20 579 6 NaN
17596 Fujian China 26.0789 117.9874 3/30/20 340 1 NaN
17597 Gansu China 37.8099 101.0583 3/30/20 138 2 NaN
17598 Guangdong China 23.3417 113.4244 3/30/20 1484 8 NaN
17599 Guangxi China 23.8298 108.7881 3/30/20 254 2 NaN
17600 Guizhou China 26.8154 106.8748 3/30/20 146 2 NaN
17601 Hainan China 19.1959 109.7453 3/30/20 168 6 NaN
17602 Hebei China 39.5490 116.1306 3/30/20 321 6 NaN
17603 Heilongjiang China 47.8620 127.7615 3/30/20 484 13 NaN
17604 Henan China 33.8820 113.6140 3/30/20 1276 22 NaN
17605 Hong Kong China 22.3000 114.2000 3/30/20 682 4 NaN
17606 Hubei China 30.9756 112.2707 3/30/20 67801 3186 NaN
17607 Hunan China 27.6104 111.7088 3/30/20 1018 4 NaN
17608 Inner Mongolia China 44.0935 113.9448 3/30/20 97 1 NaN
17609 Jiangsu China 32.9711 119.4550 3/30/20 645 0 NaN
17610 Jiangxi China 27.6140 115.7221 3/30/20 937 1 NaN
17611 Jilin China 43.6661 126.1923 3/30/20 98 1 NaN
17612 Liaoning China 41.2956 122.6085 3/30/20 136 2 NaN
17613 Macau China 22.1667 113.5500 3/30/20 38 0 NaN
17614 Ningxia China 37.2692 106.1655 3/30/20 75 0 NaN
17615 Qinghai China 35.7452 95.9956 3/30/20 18 0 NaN
17616 Shaanxi China 35.1917 108.8701 3/30/20 253 3 NaN
17617 Shandong China 36.3427 118.1498 3/30/20 773 7 NaN
In [23]:
all_data_df[["Deaths", "Recovered", "Confirmed"]] = all_data_df[["Deaths", "Recovered", "Confirmed"]].apply(lambda row: row.fillna(0))
In [24]:
all_data_df[all_data_df["Recovered"].isna()].head(50)
Out[24]:
Province/State Country/Region Lat Long Date Confirmed Deaths Recovered
In [25]:
all_data_df[all_data_df["Province/State"].isna()]["Country/Region"].unique()
Out[25]:
array(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Antigua and Barbuda', 'Argentina', 'Armenia', 'Austria',
       'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados',
       'Belarus', 'Belgium', 'Benin', 'Bhutan', 'Bolivia',
       'Bosnia and Herzegovina', 'Brazil', 'Brunei', 'Bulgaria',
       'Burkina Faso', 'Cabo Verde', 'Cambodia', 'Cameroon',
       'Central African Republic', 'Chad', 'Chile', 'Colombia',
       'Congo (Brazzaville)', 'Congo (Kinshasa)', 'Costa Rica',
       "Cote d'Ivoire", 'Croatia', 'Diamond Princess', 'Cuba', 'Cyprus',
       'Czechia', 'Denmark', 'Djibouti', 'Dominican Republic', 'Ecuador',
       'Egypt', 'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia',
       'Eswatini', 'Ethiopia', 'Fiji', 'Finland', 'France', 'Gabon',
       'Gambia', 'Georgia', 'Germany', 'Ghana', 'Greece', 'Guatemala',
       'Guinea', 'Guyana', 'Haiti', 'Holy See', 'Honduras', 'Hungary',
       'Iceland', 'India', 'Indonesia', 'Iran', 'Iraq', 'Ireland',
       'Israel', 'Italy', 'Jamaica', 'Japan', 'Jordan', 'Kazakhstan',
       'Kenya', 'Korea, South', 'Kuwait', 'Kyrgyzstan', 'Latvia',
       'Lebanon', 'Liberia', 'Liechtenstein', 'Lithuania', 'Luxembourg',
       'Madagascar', 'Malaysia', 'Maldives', 'Malta', 'Mauritania',
       'Mauritius', 'Mexico', 'Moldova', 'Monaco', 'Mongolia',
       'Montenegro', 'Morocco', 'Namibia', 'Nepal', 'Netherlands',
       'New Zealand', 'Nicaragua', 'Niger', 'Nigeria', 'North Macedonia',
       'Norway', 'Oman', 'Pakistan', 'Panama', 'Papua New Guinea',
       'Paraguay', 'Peru', 'Philippines', 'Poland', 'Portugal', 'Qatar',
       'Romania', 'Russia', 'Rwanda', 'Saint Lucia',
       'Saint Vincent and the Grenadines', 'San Marino', 'Saudi Arabia',
       'Senegal', 'Serbia', 'Seychelles', 'Singapore', 'Slovakia',
       'Slovenia', 'Somalia', 'South Africa', 'Spain', 'Sri Lanka',
       'Sudan', 'Suriname', 'Sweden', 'Switzerland', 'Taiwan*',
       'Tanzania', 'Thailand', 'Togo', 'Trinidad and Tobago', 'Tunisia',
       'Turkey', 'Uganda', 'Ukraine', 'United Arab Emirates',
       'United Kingdom', 'Uruguay', 'US', 'Uzbekistan', 'Venezuela',
       'Vietnam', 'Zambia', 'Zimbabwe', 'Dominica', 'Grenada',
       'Mozambique', 'Syria', 'Timor-Leste', 'Belize', 'Laos', 'Libya',
       'West Bank and Gaza', 'Guinea-Bissau', 'Mali',
       'Saint Kitts and Nevis', 'Kosovo', 'Burma', 'MS Zaandam',
       'Botswana', 'Burundi', 'Sierra Leone', 'Malawi'], dtype=object)
In [26]:
all_data_df
Out[26]:
Province/State Country/Region Lat Long Date Confirmed Deaths Recovered
0 NaN Afghanistan 33.000000 65.000000 1/22/20 0 0 0.0
1 NaN Albania 41.153300 20.168300 1/22/20 0 0 0.0
2 NaN Algeria 28.033900 1.659600 1/22/20 0 0 0.0
3 NaN Andorra 42.506300 1.521800 1/22/20 0 0 0.0
4 NaN Angola -11.202700 17.873900 1/22/20 0 0 0.0
... ... ... ... ... ... ... ... ...
18571 NaN Botswana -22.328500 24.684900 4/2/20 4 1 0.0
18572 NaN Burundi -3.373100 29.918900 4/2/20 3 0 0.0
18573 NaN Sierra Leone 8.460555 -11.779889 4/2/20 2 0 0.0
18574 Bonaire, Sint Eustatius and Saba Netherlands 12.178400 -68.238500 4/2/20 2 0 0.0
18575 NaN Malawi -13.254308 34.301525 4/2/20 3 0 0.0

18576 rows × 8 columns

In [27]:
all_data_df.loc[:, "Date"] = all_data_df["Date"].apply(lambda s: pd.to_datetime(s).date())
latest_date = all_data_df["Date"].max()
In [28]:
all_data_df
Out[28]:
Province/State Country/Region Lat Long Date Confirmed Deaths Recovered
0 NaN Afghanistan 33.000000 65.000000 2020-01-22 0 0 0.0
1 NaN Albania 41.153300 20.168300 2020-01-22 0 0 0.0
2 NaN Algeria 28.033900 1.659600 2020-01-22 0 0 0.0
3 NaN Andorra 42.506300 1.521800 2020-01-22 0 0 0.0
4 NaN Angola -11.202700 17.873900 2020-01-22 0 0 0.0
... ... ... ... ... ... ... ... ...
18571 NaN Botswana -22.328500 24.684900 2020-04-02 4 1 0.0
18572 NaN Burundi -3.373100 29.918900 2020-04-02 3 0 0.0
18573 NaN Sierra Leone 8.460555 -11.779889 2020-04-02 2 0 0.0
18574 Bonaire, Sint Eustatius and Saba Netherlands 12.178400 -68.238500 2020-04-02 2 0 0.0
18575 NaN Malawi -13.254308 34.301525 2020-04-02 3 0 0.0

18576 rows × 8 columns

In [29]:
cro_data_df = all_data_df[(all_data_df["Country/Region"] == "Croatia")].sort_values('Date')
In [30]:
cro_data_df
Out[30]:
Province/State Country/Region Lat Long Date Confirmed Deaths Recovered
87 NaN Croatia 45.1 15.2 2020-01-22 0 0 0.0
345 NaN Croatia 45.1 15.2 2020-01-23 0 0 0.0
603 NaN Croatia 45.1 15.2 2020-01-24 0 0 0.0
861 NaN Croatia 45.1 15.2 2020-01-25 0 0 0.0
1119 NaN Croatia 45.1 15.2 2020-01-26 0 0 0.0
... ... ... ... ... ... ... ... ...
17373 NaN Croatia 45.1 15.2 2020-03-29 713 6 310.0
17631 NaN Croatia 45.1 15.2 2020-03-30 790 6 0.0
17889 NaN Croatia 45.1 15.2 2020-03-31 867 6 0.0
18147 NaN Croatia 45.1 15.2 2020-04-01 963 6 0.0
18405 NaN Croatia 45.1 15.2 2020-04-02 1011 7 0.0

72 rows × 8 columns

In [31]:
cro_data_df_conf = all_data_df[(all_data_df["Country/Region"] == "Croatia")&(all_data_df["Confirmed"] > 0)].sort_values('Date')
In [32]:
cro_data_df_conf
Out[32]:
Province/State Country/Region Lat Long Date Confirmed Deaths Recovered
8859 NaN Croatia 45.1 15.2 2020-02-25 1 0 0.0
9117 NaN Croatia 45.1 15.2 2020-02-26 3 0 0.0
9375 NaN Croatia 45.1 15.2 2020-02-27 3 0 0.0
9633 NaN Croatia 45.1 15.2 2020-02-28 5 0 0.0
9891 NaN Croatia 45.1 15.2 2020-02-29 6 0 149.0
10149 NaN Croatia 45.1 15.2 2020-03-01 7 0 0.0
10407 NaN Croatia 45.1 15.2 2020-03-02 7 0 0.0
10665 NaN Croatia 45.1 15.2 2020-03-03 9 0 2.0
10923 NaN Croatia 45.1 15.2 2020-03-04 10 0 0.0
11181 NaN Croatia 45.1 15.2 2020-03-05 10 0 0.0
11439 NaN Croatia 45.1 15.2 2020-03-06 11 0 0.0
11697 NaN Croatia 45.1 15.2 2020-03-07 12 0 0.0
11955 NaN Croatia 45.1 15.2 2020-03-08 12 0 0.0
12213 NaN Croatia 45.1 15.2 2020-03-09 12 0 0.0
12471 NaN Croatia 45.1 15.2 2020-03-10 14 0 0.0
12729 NaN Croatia 45.1 15.2 2020-03-11 19 0 349.0
12987 NaN Croatia 45.1 15.2 2020-03-12 19 0 71.0
13245 NaN Croatia 45.1 15.2 2020-03-13 32 0 1.0
13503 NaN Croatia 45.1 15.2 2020-03-14 38 0 0.0
13761 NaN Croatia 45.1 15.2 2020-03-15 49 0 0.0
14019 NaN Croatia 45.1 15.2 2020-03-16 57 0 1.0
14277 NaN Croatia 45.1 15.2 2020-03-17 65 0 20.0
14535 NaN Croatia 45.1 15.2 2020-03-18 81 0 0.0
14793 NaN Croatia 45.1 15.2 2020-03-19 105 1 0.0
15051 NaN Croatia 45.1 15.2 2020-03-20 128 1 0.0
15309 NaN Croatia 45.1 15.2 2020-03-21 206 1 22.0
15567 NaN Croatia 45.1 15.2 2020-03-22 254 1 7.0
15825 NaN Croatia 45.1 15.2 2020-03-23 315 1 0.0
16083 NaN Croatia 45.1 15.2 2020-03-24 382 1 135.0
16341 NaN Croatia 45.1 15.2 2020-03-25 442 1 0.0
16599 NaN Croatia 45.1 15.2 2020-03-26 495 3 30.0
16857 NaN Croatia 45.1 15.2 2020-03-27 586 3 0.0
17115 NaN Croatia 45.1 15.2 2020-03-28 657 5 10.0
17373 NaN Croatia 45.1 15.2 2020-03-29 713 6 310.0
17631 NaN Croatia 45.1 15.2 2020-03-30 790 6 0.0
17889 NaN Croatia 45.1 15.2 2020-03-31 867 6 0.0
18147 NaN Croatia 45.1 15.2 2020-04-01 963 6 0.0
18405 NaN Croatia 45.1 15.2 2020-04-02 1011 7 0.0
In [33]:
grid = sns.lineplot(data=cro_data_df_conf[(cro_data_df_conf["Confirmed"] > 0)], x="Date", y="Confirmed")
In [34]:
grid = sns.lineplot(data=cro_data_df_conf[(cro_data_df_conf["Confirmed"] > 0)], x="Date", y="Deaths")
In [35]:
grid = sns.lineplot(data=cro_data_df_conf[(cro_data_df_conf["Confirmed"] > 0)], x="Date", y="Recovered")
In [36]:
import datetime

cro_data_df_conf['DateWeek'] = cro_data_df_conf['Date'].apply(lambda date_row: date_row.isocalendar()[1])
cro_data_df_conf['DayOfYear'] = cro_data_df_conf['Date'].apply(lambda date_row: date_row.timetuple().tm_yday)
In [37]:
cro_data_df_conf
Out[37]:
Province/State Country/Region Lat Long Date Confirmed Deaths Recovered DateWeek DayOfYear
8859 NaN Croatia 45.1 15.2 2020-02-25 1 0 0.0 9 56
9117 NaN Croatia 45.1 15.2 2020-02-26 3 0 0.0 9 57
9375 NaN Croatia 45.1 15.2 2020-02-27 3 0 0.0 9 58
9633 NaN Croatia 45.1 15.2 2020-02-28 5 0 0.0 9 59
9891 NaN Croatia 45.1 15.2 2020-02-29 6 0 149.0 9 60
10149 NaN Croatia 45.1 15.2 2020-03-01 7 0 0.0 9 61
10407 NaN Croatia 45.1 15.2 2020-03-02 7 0 0.0 10 62
10665 NaN Croatia 45.1 15.2 2020-03-03 9 0 2.0 10 63
10923 NaN Croatia 45.1 15.2 2020-03-04 10 0 0.0 10 64
11181 NaN Croatia 45.1 15.2 2020-03-05 10 0 0.0 10 65
11439 NaN Croatia 45.1 15.2 2020-03-06 11 0 0.0 10 66
11697 NaN Croatia 45.1 15.2 2020-03-07 12 0 0.0 10 67
11955 NaN Croatia 45.1 15.2 2020-03-08 12 0 0.0 10 68
12213 NaN Croatia 45.1 15.2 2020-03-09 12 0 0.0 11 69
12471 NaN Croatia 45.1 15.2 2020-03-10 14 0 0.0 11 70
12729 NaN Croatia 45.1 15.2 2020-03-11 19 0 349.0 11 71
12987 NaN Croatia 45.1 15.2 2020-03-12 19 0 71.0 11 72
13245 NaN Croatia 45.1 15.2 2020-03-13 32 0 1.0 11 73
13503 NaN Croatia 45.1 15.2 2020-03-14 38 0 0.0 11 74
13761 NaN Croatia 45.1 15.2 2020-03-15 49 0 0.0 11 75
14019 NaN Croatia 45.1 15.2 2020-03-16 57 0 1.0 12 76
14277 NaN Croatia 45.1 15.2 2020-03-17 65 0 20.0 12 77
14535 NaN Croatia 45.1 15.2 2020-03-18 81 0 0.0 12 78
14793 NaN Croatia 45.1 15.2 2020-03-19 105 1 0.0 12 79
15051 NaN Croatia 45.1 15.2 2020-03-20 128 1 0.0 12 80
15309 NaN Croatia 45.1 15.2 2020-03-21 206 1 22.0 12 81
15567 NaN Croatia 45.1 15.2 2020-03-22 254 1 7.0 12 82
15825 NaN Croatia 45.1 15.2 2020-03-23 315 1 0.0 13 83
16083 NaN Croatia 45.1 15.2 2020-03-24 382 1 135.0 13 84
16341 NaN Croatia 45.1 15.2 2020-03-25 442 1 0.0 13 85
16599 NaN Croatia 45.1 15.2 2020-03-26 495 3 30.0 13 86
16857 NaN Croatia 45.1 15.2 2020-03-27 586 3 0.0 13 87
17115 NaN Croatia 45.1 15.2 2020-03-28 657 5 10.0 13 88
17373 NaN Croatia 45.1 15.2 2020-03-29 713 6 310.0 13 89
17631 NaN Croatia 45.1 15.2 2020-03-30 790 6 0.0 14 90
17889 NaN Croatia 45.1 15.2 2020-03-31 867 6 0.0 14 91
18147 NaN Croatia 45.1 15.2 2020-04-01 963 6 0.0 14 92
18405 NaN Croatia 45.1 15.2 2020-04-02 1011 7 0.0 14 93
In [38]:
import numpy, scipy
from scipy.optimize import curve_fit
import warnings

np.set_printoptions(suppress=True)
In [39]:
def show_logistic_regression(x_data, y_data, title='Data projection', initial_params=np.array([1.0, 1.0, 1.0])):

    def logistic_equation(x, a, b, c):
        return a / (1.0 + np.power(x / b, c))

    fitted_parameters, pcov = curve_fit(logistic_equation, x_data, y_data, initial_params)

    model_predictions = logistic_equation(x_data, *fitted_parameters) 

    absolute_error = model_predictions - y_data

    squared_error = np.square(absolute_error)
    mean_squared_error = np.mean(squared_error)
    root_mean_squared_error = np.sqrt(mean_squared_error)
    r_squared = 1.0 - (np.var(absolute_error) / np.var(y_data))

    print('Parameters:', fitted_parameters)
    print('RMSE:', root_mean_squared_error)
    print('R-squared:', r_squared)

    print()


    def scatter_plot():
        f = plt.figure(figsize=(16,10), dpi=100)
        axes = f.add_subplot(111)

        # Plot the actual data
        axes.plot(x_data, y_data, 'X')

        # Create model data
        x_model = numpy.linspace(min(x_data), max(x_data) + 40)
        y_model = logistic_equation(x_model, *fitted_parameters)

        # Plot model data
        axes.plot(x_model, y_model)

        axes.set_xlabel('Days in year')
        axes.set_ylabel('Confirmed cases')

        axes.set_title(title)

        axes.axvline(datetime.date.today().timetuple().tm_yday, 0, 1, color='red', linewidth=4)
        axes.axhline(fitted_parameters[0], 0, 1, color='blue', linewidth=4)


    scatter_plot()
In [40]:
ax = sns.barplot(data=cro_data_df_conf, x="DayOfYear", y="Confirmed", ci="sd", palette="Oranges_d")
ax.set_title('Croatia confirmed cases')
Out[40]:
Text(0.5, 1.0, 'Croatia confirmed cases')
In [41]:
fit_data_df = cro_data_df_conf

x_data = fit_data_df['DayOfYear'].values
y_data = fit_data_df['Confirmed'].values

show_logistic_regression(x_data, y_data, title='Croatia projection')
Parameters: [1331.96287923   88.12660728  -20.69658637]
RMSE: 11.365864103701535
R-squared: 0.9986600578415235

In [42]:
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import math

mu = 100
variance = 10
sigma = math.sqrt(variance)
In [43]:
def show_gaussian_regression(x_data, y_data, title='Data projection', initial_params=np.array([1.0, 1.0, 1.0])):

    def gaussian_equation(x, size, mu, sig):
        return size * np.exp(-np.power(x - mu, 2.) / (2 * np.power(sig, 2.)))

    fitted_parameters, pcov = curve_fit(gaussian_equation, x_data, y_data, initial_params, maxfev=10000)

    model_predictions = gaussian_equation(x_data, *fitted_parameters) 

    absolute_error = model_predictions - y_data

    squared_error = np.square(absolute_error)
    mean_squared_error = np.mean(squared_error)
    root_mean_squared_error = np.sqrt(mean_squared_error)
    r_squared = 1.0 - (np.var(absolute_error) / np.var(y_data))

    print('Parameters:', fitted_parameters)
    print('RMSE:', root_mean_squared_error)
    print('R-squared:', r_squared)

    print()


    def scatter_plot():
        f = plt.figure(figsize=(16,10), dpi=100)
        axes = f.add_subplot(111)

        # Plot the actual data
        axes.plot(x_data, y_data, 'X')

        # Create model data
        x_model = numpy.linspace(min(x_data), max(x_data) + 20)
        y_model = gaussian_equation(x_model, *fitted_parameters)

        # Plot model data
        axes.plot(x_model, y_model)

        axes.set_xlabel('Days in year')
        axes.set_ylabel('Confirmed cases')

        axes.set_title(title)

        #axes.axhline(fitted_parameters[0], 0, 1, color='blue', linewidth=4)


    scatter_plot()
In [44]:
import pymc3 as pm
print('Running on PyMC3 v{}'.format(pm.__version__))
Running on PyMC3 v3.8
In [45]:
fit_df = cro_data_df_conf

fit_df['Diff'] = fit_df['Confirmed'].diff()
fit_df['GrowthFactor'] = fit_df['Diff'] / fit_df['Diff'].shift(-1)

sns.barplot(data=fit_df, x="DayOfYear", y="Diff", palette="Blues_d")
Out[45]:
<matplotlib.axes._subplots.AxesSubplot at 0x22985eaa548>
In [46]:
x = fit_df['DayOfYear'].fillna(0).values
y = fit_df['Diff'].fillna(0).values
In [47]:
show_gaussian_regression(x_data, y_data, initial_params=np.array([1000, 90.0, 10.0]))
Parameters: [1110.77882234   96.82896293    8.55785011]
RMSE: 11.609162295966911
R-squared: 0.9986353699797148

In [48]:
n = len(x) # The number of data points
X = x[:, None] # The inputs to the GP, they must be arranged as a column vector

## Plot the data and the unobserved latent function
fig = plt.figure(figsize=(16,10), dpi=100)
ax = fig.gca()

ax.plot(X, y, 'ok', ms=3, alpha=0.5, label="Data")
ax.set_xlabel("Days in year")
ax.set_ylabel("Confirmed");
plt.legend()
Out[48]:
<matplotlib.legend.Legend at 0x22986075848>
In [49]:
import theano.tensor as tt

def warp_func(x, a, b, c):
    return 1.0 + x + (a * tt.tanh(b * (x - c)))

with pm.Model() as model:
 
     = pm.Gamma("ℓ", alpha=2, beta=1)
    η = pm.HalfCauchy("η", beta=5)
    
    cov_exp = η**2 * pm.gp.cov.Matern52(1, )
    
    gp = pm.gp.Marginal(cov_func=cov_exp)

    σ = pm.HalfCauchy("σ", beta=5)
    y_ = gp.marginal_likelihood("y", X=X, y=y, noise=σ)

    mp = pm.find_MAP()
C:\Users\matij\Anaconda3\envs\covid-analysis\lib\site-packages\theano\tensor\basic.py:6611: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

C:\Users\matij\Anaconda3\envs\covid-analysis\lib\site-packages\theano\tensor\basic.py:6611: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

C:\Users\matij\Anaconda3\envs\covid-analysis\lib\site-packages\theano\tensor\basic.py:6611: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

C:\Users\matij\Anaconda3\envs\covid-analysis\lib\site-packages\theano\tensor\basic.py:6611: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

C:\Users\matij\Anaconda3\envs\covid-analysis\lib\site-packages\theano\tensor\basic.py:6611: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

C:\Users\matij\Anaconda3\envs\covid-analysis\lib\site-packages\theano\tensor\basic.py:6611: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

  0%|                                                                                         | 0/5000 [00:00<?, ?it/s]C:\Users\matij\Anaconda3\envs\covid-analysis\lib\site-packages\theano\tensor\basic.py:6611: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

logp = -170.86, ||grad|| = 0.0214: 100%|██████████████████████████████████████████████| 14/14 [00:00<00:00, 759.88it/s]
In [50]:
# new values, we convert into column vector
X_new = np.linspace(50, 120, 40)[:,None]

# add the GP conditional to the model, given the new X values
with model:
    f_pred = gp.conditional("f_pred", X_new)

# To use the MAP values, you can just replace the trace with a length-1 list with `mp`
with model:
    pred_samples = pm.sample_posterior_predictive([mp], vars=[f_pred], samples=1000)
C:\Users\matij\Anaconda3\envs\covid-analysis\lib\site-packages\theano\tensor\basic.py:6611: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

  0%|                                                                                         | 0/1000 [00:00<?, ?it/s]C:\Users\matij\Anaconda3\envs\covid-analysis\lib\site-packages\theano\tensor\basic.py:6611: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

C:\Users\matij\Anaconda3\envs\covid-analysis\lib\site-packages\theano\tensor\basic.py:6611: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:13<00:00, 72.22it/s]
In [51]:
# plot the results
fig = plt.figure(figsize=(16,10), dpi=100)
ax = fig.gca()

# plot the samples from the gp posterior with samples and shading
from pymc3.gp.util import plot_gp_dist
plot_gp_dist(ax, pred_samples["f_pred"], X_new)

# plot the data and the true latent function
plt.plot(X, y, 'ok', ms=3, alpha=0.5, label="Observed data")

# axis labels and title
plt.xlabel("X")
plt.title("Posterior distribution over $f(x)$ at the observed values")
plt.legend()
Out[51]:
<matplotlib.legend.Legend at 0x2298be05ec8>