import pandas as pd
import numpy as np
import seaborn as sns


df=pd.read_excel('health_data_nulls.xlsx')
df1 = pd.read_excel('health_data_oob_values.xlsx')
df


df1.head(20)


sns.pairplot(df)

<seaborn.axisgrid.PairGrid at 0x1abfedb0688>


sns.pairplot(df1)

<seaborn.axisgrid.PairGrid at 0x1ab80f06548>


df1.Alcohol[df1.Alcohol==999] = np.nan
df1.head(15)


df['Age'].isnull().sum()

104


df['Income'].isnull().sum()

31


df['Age'].mean()

44.825892857142854


df['AgeImpMean']=df['Age'].fillna(df['Age'].mean())
df.head(15)


df['AgeImpMed']=df['Age'].fillna(df['Age'].median())
df['AgeImpMode']=df['Age'].fillna(df['Age'].mode())
df.head(15)


sns.regplot(x="Age", y="Blood Pressure", data=df)

<AxesSubplot:xlabel='Age', ylabel='Blood Pressure'>


sns.regplot(x="AgeImpMean", y="Blood Pressure", data=df)

<AxesSubplot:xlabel='AgeImpMean', ylabel='Blood Pressure'>


df['AgeNA']=df['Age'].isna()
df.head(15)


sns.lmplot(x="AgeImpMean", y="Blood Pressure", data=df, hue='AgeNA')

<seaborn.axisgrid.FacetGrid at 0x1abfecf0908>


from scipy import stats


corr, _ = stats.pearsonr(df['AgeImpMean'], df['Blood Pressure'])
corr

0.6449459380613165


import statsmodels.api as sm


correlation = df['AgeImpMean'].corr(df['Blood Pressure'])
correlation

0.6449459380613166


correlation = df['Age'].corr(df['Income'])
correlation

0.4989889602448255


df.dropna()


from statsmodels.formula.api import ols #or glm for logistic


df2=df


df2['BP']=df['Blood Pressure']


model_lm = ols(formula = 'BP ~ Age',
               data = df2).fit()
print(model_lm.params)

Intercept   -6.063997
Age          1.170684
dtype: float64


df2['Predicted']=df2['Age']*1.170684+(-6.063997)


df2['Predicted'].head()

0    65.347727
1    58.323623
2    55.982255
3    29.056523
4    68.859779
Name: Predicted, dtype: float64

	Person	Age	Income	Alcohol	Exercise	Smoke	Blood Pressure
0	1	61.0	268300.0	41.0	NaN	3.0	62
1	2	55.0	122200.0	51.0	7.0	56.0	53
2	3	53.0	82100.0	37.0	0.0	55.0	42
3	4	30.0	101400.0	41.0	20.0	61.0	48
4	5	64.0	181100.0	NaN	0.0	70.0	81
...	...	...	...	...	...	...	...
995	996	50.0	141300.0	9.0	11.0	36.0	43
996	997	40.0	155700.0	0.0	NaN	3.0	29
997	998	36.0	84700.0	42.0	47.0	21.0	21
998	999	51.0	124500.0	63.0	40.0	1.0	25
999	1000	28.0	241200.0	NaN	90.0	0.0	42

	Person	Age	Income	Alcohol	Exercise	Smoke	Blood Pressure
0	1	61	268300	41.0	999	3	62
1	2	55	122200	51.0	7	56	53
2	3	53	82100	37.0	0	55	42
3	4	30	101400	41.0	20	61	48
4	5	64	181100	NaN	0	70	81
5	6	45	156600	60.0	35	999	80
6	7	56	160400	55.0	999	59	63
7	8	999	78800	31.0	12	43	31
8	9	59	233500	25.0	15	33	66
9	10	44	50400	64.0	0	85	54
10	11	999	224400	69.0	21	55	78
11	12	42	175000	22.0	26	21	43
12	13	63	255900	46.0	32	24	68
13	14	30	70300	53.0	999	79	56
14	15	52	229500	74.0	56	69	85

	Person	Age	Income	Alcohol	Exercise	Smoke	Blood Pressure	AgeImpMean
0	1	61.0	268300.0	41.0	NaN	3.0	62	61.000000
1	2	55.0	122200.0	51.0	7.0	56.0	53	55.000000
2	3	53.0	82100.0	37.0	0.0	55.0	42	53.000000
3	4	30.0	101400.0	41.0	20.0	61.0	48	30.000000
4	5	64.0	181100.0	NaN	0.0	70.0	81	64.000000
5	6	45.0	156600.0	60.0	35.0	NaN	80	45.000000
6	7	56.0	160400.0	55.0	NaN	59.0	63	56.000000
7	8	NaN	78800.0	31.0	12.0	43.0	31	44.825893
8	9	59.0	233500.0	25.0	15.0	33.0	66	59.000000
9	10	44.0	50400.0	64.0	0.0	85.0	54	44.000000
10	11	NaN	224400.0	69.0	21.0	55.0	78	44.825893
11	12	42.0	175000.0	22.0	26.0	21.0	43	42.000000
12	13	63.0	255900.0	46.0	32.0	24.0	68	63.000000
13	14	30.0	70300.0	53.0	NaN	79.0	56	30.000000
14	15	52.0	229500.0	74.0	56.0	69.0	85	52.000000

	Person	Age	Income	Alcohol	Exercise	Smoke	Blood Pressure	AgeImpMean	AgeImpMed	AgeImpMode
0	1	61.0	268300.0	41.0	NaN	3.0	62	61.000000	61.0	61.0
1	2	55.0	122200.0	51.0	7.0	56.0	53	55.000000	55.0	55.0
2	3	53.0	82100.0	37.0	0.0	55.0	42	53.000000	53.0	53.0
3	4	30.0	101400.0	41.0	20.0	61.0	48	30.000000	30.0	30.0
4	5	64.0	181100.0	NaN	0.0	70.0	81	64.000000	64.0	64.0
5	6	45.0	156600.0	60.0	35.0	NaN	80	45.000000	45.0	45.0
6	7	56.0	160400.0	55.0	NaN	59.0	63	56.000000	56.0	56.0
7	8	NaN	78800.0	31.0	12.0	43.0	31	44.825893	45.0	NaN
8	9	59.0	233500.0	25.0	15.0	33.0	66	59.000000	59.0	59.0
9	10	44.0	50400.0	64.0	0.0	85.0	54	44.000000	44.0	44.0
10	11	NaN	224400.0	69.0	21.0	55.0	78	44.825893	45.0	NaN
11	12	42.0	175000.0	22.0	26.0	21.0	43	42.000000	42.0	42.0
12	13	63.0	255900.0	46.0	32.0	24.0	68	63.000000	63.0	63.0
13	14	30.0	70300.0	53.0	NaN	79.0	56	30.000000	30.0	30.0
14	15	52.0	229500.0	74.0	56.0	69.0	85	52.000000	52.0	52.0

	Person	Age	Income	Alcohol	Exercise	Smoke	Blood Pressure	AgeImpMean	AgeImpMed	AgeImpMode	AgeNA
0	1	61.0	268300.0	41.0	NaN	3.0	62	61.000000	61.0	61.0	False
1	2	55.0	122200.0	51.0	7.0	56.0	53	55.000000	55.0	55.0	False
2	3	53.0	82100.0	37.0	0.0	55.0	42	53.000000	53.0	53.0	False
3	4	30.0	101400.0	41.0	20.0	61.0	48	30.000000	30.0	30.0	False
4	5	64.0	181100.0	NaN	0.0	70.0	81	64.000000	64.0	64.0	False
5	6	45.0	156600.0	60.0	35.0	NaN	80	45.000000	45.0	45.0	False
6	7	56.0	160400.0	55.0	NaN	59.0	63	56.000000	56.0	56.0	False
7	8	NaN	78800.0	31.0	12.0	43.0	31	44.825893	45.0	NaN	True
8	9	59.0	233500.0	25.0	15.0	33.0	66	59.000000	59.0	59.0	False
9	10	44.0	50400.0	64.0	0.0	85.0	54	44.000000	44.0	44.0	False
10	11	NaN	224400.0	69.0	21.0	55.0	78	44.825893	45.0	NaN	True
11	12	42.0	175000.0	22.0	26.0	21.0	43	42.000000	42.0	42.0	False
12	13	63.0	255900.0	46.0	32.0	24.0	68	63.000000	63.0	63.0	False
13	14	30.0	70300.0	53.0	NaN	79.0	56	30.000000	30.0	30.0	False
14	15	52.0	229500.0	74.0	56.0	69.0	85	52.000000	52.0	52.0	False