In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
In [26]:
df=pd.read_excel('health_data_nulls.xlsx')
df1 = pd.read_excel('health_data_oob_values.xlsx')
df
Out[26]:
Person Age Income Alcohol Exercise Smoke Blood Pressure
0 1 61.0 268300.0 41.0 NaN 3.0 62
1 2 55.0 122200.0 51.0 7.0 56.0 53
2 3 53.0 82100.0 37.0 0.0 55.0 42
3 4 30.0 101400.0 41.0 20.0 61.0 48
4 5 64.0 181100.0 NaN 0.0 70.0 81
... ... ... ... ... ... ... ...
995 996 50.0 141300.0 9.0 11.0 36.0 43
996 997 40.0 155700.0 0.0 NaN 3.0 29
997 998 36.0 84700.0 42.0 47.0 21.0 21
998 999 51.0 124500.0 63.0 40.0 1.0 25
999 1000 28.0 241200.0 NaN 90.0 0.0 42

1000 rows × 7 columns

In [4]:
df1.head(20)
Out[4]:
Person Age Income Alcohol Exercise Smoke Blood Pressure
0 1 61 268300 41 999 3 62
1 2 55 122200 51 7 56 53
2 3 53 82100 37 0 55 42
3 4 30 101400 41 20 61 48
4 5 64 181100 999 0 70 81
5 6 45 156600 60 35 999 80
6 7 56 160400 55 999 59 63
7 8 999 78800 31 12 43 31
8 9 59 233500 25 15 33 66
9 10 44 50400 64 0 85 54
10 11 999 224400 69 21 55 78
11 12 42 175000 22 26 21 43
12 13 63 255900 46 32 24 68
13 14 30 70300 53 999 79 56
14 15 52 229500 74 56 69 85
15 16 54 188600 62 51 5 39
16 17 42 265000 41 88 0 47
17 18 36 65400 41 14 59 34
18 19 56 81200 64 0 71 53
19 20 33 67200 46 9 54 33
In [5]:
sns.pairplot(df)
Out[5]:
<seaborn.axisgrid.PairGrid at 0x1abfedb0688>
In [6]:
sns.pairplot(df1)
Out[6]:
<seaborn.axisgrid.PairGrid at 0x1ab80f06548>
In [7]:
df1.Alcohol[df1.Alcohol==999] = np.nan
df1.head(15)
Out[7]:
Person Age Income Alcohol Exercise Smoke Blood Pressure
0 1 61 268300 41.0 999 3 62
1 2 55 122200 51.0 7 56 53
2 3 53 82100 37.0 0 55 42
3 4 30 101400 41.0 20 61 48
4 5 64 181100 NaN 0 70 81
5 6 45 156600 60.0 35 999 80
6 7 56 160400 55.0 999 59 63
7 8 999 78800 31.0 12 43 31
8 9 59 233500 25.0 15 33 66
9 10 44 50400 64.0 0 85 54
10 11 999 224400 69.0 21 55 78
11 12 42 175000 22.0 26 21 43
12 13 63 255900 46.0 32 24 68
13 14 30 70300 53.0 999 79 56
14 15 52 229500 74.0 56 69 85
In [8]:
df['Age'].isnull().sum()
Out[8]:
104
In [10]:
df['Income'].isnull().sum()
Out[10]:
31
In [11]:
df['Age'].mean()
Out[11]:
44.825892857142854
In [12]:
df['AgeImpMean']=df['Age'].fillna(df['Age'].mean())
df.head(15)
Out[12]:
Person Age Income Alcohol Exercise Smoke Blood Pressure AgeImpMean
0 1 61.0 268300.0 41.0 NaN 3.0 62 61.000000
1 2 55.0 122200.0 51.0 7.0 56.0 53 55.000000
2 3 53.0 82100.0 37.0 0.0 55.0 42 53.000000
3 4 30.0 101400.0 41.0 20.0 61.0 48 30.000000
4 5 64.0 181100.0 NaN 0.0 70.0 81 64.000000
5 6 45.0 156600.0 60.0 35.0 NaN 80 45.000000
6 7 56.0 160400.0 55.0 NaN 59.0 63 56.000000
7 8 NaN 78800.0 31.0 12.0 43.0 31 44.825893
8 9 59.0 233500.0 25.0 15.0 33.0 66 59.000000
9 10 44.0 50400.0 64.0 0.0 85.0 54 44.000000
10 11 NaN 224400.0 69.0 21.0 55.0 78 44.825893
11 12 42.0 175000.0 22.0 26.0 21.0 43 42.000000
12 13 63.0 255900.0 46.0 32.0 24.0 68 63.000000
13 14 30.0 70300.0 53.0 NaN 79.0 56 30.000000
14 15 52.0 229500.0 74.0 56.0 69.0 85 52.000000
In [13]:
df['AgeImpMed']=df['Age'].fillna(df['Age'].median())
df['AgeImpMode']=df['Age'].fillna(df['Age'].mode())
df.head(15)
Out[13]:
Person Age Income Alcohol Exercise Smoke Blood Pressure AgeImpMean AgeImpMed AgeImpMode
0 1 61.0 268300.0 41.0 NaN 3.0 62 61.000000 61.0 61.0
1 2 55.0 122200.0 51.0 7.0 56.0 53 55.000000 55.0 55.0
2 3 53.0 82100.0 37.0 0.0 55.0 42 53.000000 53.0 53.0
3 4 30.0 101400.0 41.0 20.0 61.0 48 30.000000 30.0 30.0
4 5 64.0 181100.0 NaN 0.0 70.0 81 64.000000 64.0 64.0
5 6 45.0 156600.0 60.0 35.0 NaN 80 45.000000 45.0 45.0
6 7 56.0 160400.0 55.0 NaN 59.0 63 56.000000 56.0 56.0
7 8 NaN 78800.0 31.0 12.0 43.0 31 44.825893 45.0 NaN
8 9 59.0 233500.0 25.0 15.0 33.0 66 59.000000 59.0 59.0
9 10 44.0 50400.0 64.0 0.0 85.0 54 44.000000 44.0 44.0
10 11 NaN 224400.0 69.0 21.0 55.0 78 44.825893 45.0 NaN
11 12 42.0 175000.0 22.0 26.0 21.0 43 42.000000 42.0 42.0
12 13 63.0 255900.0 46.0 32.0 24.0 68 63.000000 63.0 63.0
13 14 30.0 70300.0 53.0 NaN 79.0 56 30.000000 30.0 30.0
14 15 52.0 229500.0 74.0 56.0 69.0 85 52.000000 52.0 52.0
In [14]:
sns.regplot(x="Age", y="Blood Pressure", data=df)
Out[14]:
<AxesSubplot:xlabel='Age', ylabel='Blood Pressure'>
In [15]:
sns.regplot(x="AgeImpMean", y="Blood Pressure", data=df)
Out[15]:
<AxesSubplot:xlabel='AgeImpMean', ylabel='Blood Pressure'>
In [16]:
df['AgeNA']=df['Age'].isna()
df.head(15)
Out[16]:
Person Age Income Alcohol Exercise Smoke Blood Pressure AgeImpMean AgeImpMed AgeImpMode AgeNA
0 1 61.0 268300.0 41.0 NaN 3.0 62 61.000000 61.0 61.0 False
1 2 55.0 122200.0 51.0 7.0 56.0 53 55.000000 55.0 55.0 False
2 3 53.0 82100.0 37.0 0.0 55.0 42 53.000000 53.0 53.0 False
3 4 30.0 101400.0 41.0 20.0 61.0 48 30.000000 30.0 30.0 False
4 5 64.0 181100.0 NaN 0.0 70.0 81 64.000000 64.0 64.0 False
5 6 45.0 156600.0 60.0 35.0 NaN 80 45.000000 45.0 45.0 False
6 7 56.0 160400.0 55.0 NaN 59.0 63 56.000000 56.0 56.0 False
7 8 NaN 78800.0 31.0 12.0 43.0 31 44.825893 45.0 NaN True
8 9 59.0 233500.0 25.0 15.0 33.0 66 59.000000 59.0 59.0 False
9 10 44.0 50400.0 64.0 0.0 85.0 54 44.000000 44.0 44.0 False
10 11 NaN 224400.0 69.0 21.0 55.0 78 44.825893 45.0 NaN True
11 12 42.0 175000.0 22.0 26.0 21.0 43 42.000000 42.0 42.0 False
12 13 63.0 255900.0 46.0 32.0 24.0 68 63.000000 63.0 63.0 False
13 14 30.0 70300.0 53.0 NaN 79.0 56 30.000000 30.0 30.0 False
14 15 52.0 229500.0 74.0 56.0 69.0 85 52.000000 52.0 52.0 False
In [18]:
sns.lmplot(x="AgeImpMean", y="Blood Pressure", data=df, hue='AgeNA')
Out[18]:
<seaborn.axisgrid.FacetGrid at 0x1abfecf0908>
In [19]:
from scipy import stats
In [20]:
corr, _ = stats.pearsonr(df['AgeImpMean'], df['Blood Pressure'])
corr
Out[20]:
0.6449459380613165
In [22]:
import statsmodels.api as sm
In [23]:
correlation = df['AgeImpMean'].corr(df['Blood Pressure'])
correlation
Out[23]:
0.6449459380613166
In [24]:
correlation = df['Age'].corr(df['Income'])
correlation
Out[24]:
0.4989889602448255
In [25]:
df.dropna()
Out[25]:
Person Age Income Alcohol Exercise Smoke Blood Pressure AgeImpMean AgeImpMed AgeImpMode AgeNA
1 2 55.0 122200.0 51.0 7.0 56.0 53 55.0 55.0 55.0 False
2 3 53.0 82100.0 37.0 0.0 55.0 42 53.0 53.0 53.0 False
3 4 30.0 101400.0 41.0 20.0 61.0 48 30.0 30.0 30.0 False
8 9 59.0 233500.0 25.0 15.0 33.0 66 59.0 59.0 59.0 False
9 10 44.0 50400.0 64.0 0.0 85.0 54 44.0 44.0 44.0 False
... ... ... ... ... ... ... ... ... ... ... ...
991 992 44.0 138300.0 23.0 27.0 12.0 31 44.0 44.0 44.0 False
994 995 30.0 63000.0 34.0 28.0 33.0 18 30.0 30.0 30.0 False
995 996 50.0 141300.0 9.0 11.0 36.0 43 50.0 50.0 50.0 False
997 998 36.0 84700.0 42.0 47.0 21.0 21 36.0 36.0 36.0 False
998 999 51.0 124500.0 63.0 40.0 1.0 25 51.0 51.0 51.0 False

638 rows × 11 columns

In [34]:
from statsmodels.formula.api import ols #or glm for logistic
In [36]:
df2=df
In [38]:
df2['BP']=df['Blood Pressure']
In [43]:
model_lm = ols(formula = 'BP ~ Age',
               data = df2).fit()
print(model_lm.params)
Intercept   -6.063997
Age          1.170684
dtype: float64
In [44]:
df2['Predicted']=df2['Age']*1.170684+(-6.063997)
In [45]:
df2['Predicted'].head()
Out[45]:
0    65.347727
1    58.323623
2    55.982255
3    29.056523
4    68.859779
Name: Predicted, dtype: float64
In [ ]: