In [1]:
import pandas as pd
import numpy as np
df = pd.read_excel('survey_data2.xlsx')
df=df[['Age','Gender','State','Children','Salary','Agree or Not']]
df = pd.concat([df,pd.get_dummies(df['Age'], prefix='_', drop_first=True)],axis=1)
df.drop(['Age'],axis=1, inplace=True)
df = pd.concat([df,pd.get_dummies(df['State'], prefix='State', drop_first=True)],axis=1)
df.drop(['State'],axis=1, inplace=True)
df['Gender']=df['Gender']-1
df_copy = df.copy()
train_set = df_copy.sample(frac=0.80, random_state=0) 
test_set = df_copy.drop(train_set.index)
test_set_labels = test_set.pop('Agree or Not')
train_set_labels = train_set.pop('Agree or Not')
In [2]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_features = scaler.fit_transform(train_set)
In [6]:
mlp = MLPClassifier(hidden_layer_sizes=(17,17,17,17),max_iter=1000) #13 is the number of nodes, with 3 layers
mlp.fit(scaled_features,train_set_labels)
MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(17, 17, 17, 17), learning_rate='constant',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)
predictions = mlp.predict(train_set) #to predict with the test set you'd also have to scale it
C:\Users\Betsy\Anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py:585: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (1000) reached and the optimization hasn't converged yet.
  % self.max_iter, ConvergenceWarning)
In [7]:
from sklearn.metrics import classification_report,confusion_matrix
In [8]:
print(confusion_matrix(train_set_labels, predictions))
[[184   0]
 [135   0]]
In [9]:
print(classification_report(train_set_labels, predictions))
              precision    recall  f1-score   support

           0       0.58      1.00      0.73       184
           1       0.00      0.00      0.00       135

    accuracy                           0.58       319
   macro avg       0.29      0.50      0.37       319
weighted avg       0.33      0.58      0.42       319

C:\Users\Betsy\Anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1221: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
In [10]:
import re
In [11]:
text="The rain in Spain is mainly on the plain."
x=re.findall("ai",text)
x
Out[11]:
['ai', 'ai', 'ai', 'ai']
In [12]:
x=re.split("\s", text)
x
Out[12]:
['The', 'rain', 'in', 'Spain', 'is', 'mainly', 'on', 'the', 'plain.']
In [13]:
x=re.split("\s", text, 1)
x
Out[13]:
['The', 'rain in Spain is mainly on the plain.']
In [14]:
x=re.sub('\s', "-", text)
x
Out[14]:
'The-rain-in-Spain-is-mainly-on-the-plain.'
In [15]:
x=re.search(r"\bS\w+", text)
x.span()
Out[15]:
(12, 17)
In [17]:
print(x.string)
The rain in Spain is mainly on the plain.
In [18]:
print(x.group())
Spain
In [20]:
x=re.findall('[mat]', text)
x
Out[20]:
['a', 'a', 'm', 'a', 't', 'a']
In [21]:
x=re.findall('ain+', text)
x
Out[21]:
['ain', 'ain', 'ain', 'ain']
In [22]:
#https://www.w3schools.com/python/python_regex.asp more code keys here
In [23]:
crimedf = pd.read_excel('crime_data.xlsx')
transactdf=pd.read_excel('transaction_data.xlsx')
crimedf.head()
Out[23]:
Year Population Violent crime total Murder and nonnegligent manslaughter Forcible rape Robbery Aggravated assault Property crime total Burglary Larceny-theft Motor vehicle theft
0 1960 179323175 288460 9110 17190 107840 154320 3095700 912100 1855400 328200
1 1961 182992000 289390 8740 17220 106670 156760 3198600 949600 1913000 336000
2 1962 185771000 301510 8530 17550 110860 164570 3450700 994300 2089600 366800
3 1963 188483000 316970 8640 17650 116470 174210 3792500 1086400 2297800 408300
4 1964 191141000 364220 9360 21420 130390 203050 4200400 1213200 2514400 472800
In [24]:
transactdf.head()
Out[24]:
Transaction Purchase Date Customer ID Gender Marital Status Homeowner Children Annual Income City State or Province Country Product Family Product Department Product Category Units Sold Revenue
0 1 2014-12-18 7223 F S Y 2 $30K - $50K Los Angeles CA USA Food Snack Foods Snack Foods 5 27.38
1 2 2014-12-20 7841 M M Y 5 $70K - $90K Los Angeles CA USA Food Produce Vegetables 5 14.90
2 3 2014-12-21 8374 F M N 2 $50K - $70K Bremerton WA USA Food Snack Foods Snack Foods 3 5.52
3 4 2014-12-21 9619 M M Y 3 $30K - $50K Portland OR USA Food Snacks Candy 4 4.44
4 5 2014-12-22 1900 F S Y 3 $130K - $150K Beverly Hills CA USA Drink Beverages Carbonated Beverages 4 14.00
In [25]:
crimedf.dtypes
Out[25]:
Year                                    int64
Population                              int64
Violent crime total                     int64
Murder and nonnegligent manslaughter    int64
Forcible rape                           int64
Robbery                                 int64
Aggravated assault                      int64
Property crime total                    int64
Burglary                                int64
Larceny-theft                           int64
Motor vehicle theft                     int64
dtype: object
In [26]:
transactdf.dtypes
Out[26]:
Transaction                    int64
Purchase Date         datetime64[ns]
Customer ID                    int64
Gender                        object
Marital Status                object
Homeowner                     object
Children                       int64
Annual Income                 object
City                          object
State or Province             object
Country                       object
Product Family                object
Product Department            object
Product Category              object
Units Sold                     int64
Revenue                      float64
dtype: object
In [27]:
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.ar_model import AutoReg, ar_select_order
from statsmodels.tsa.api import acf, pacf, graphics
In [28]:
fig,ax = plt.subplots()
ax = crimedf.plot(ax=ax)
In [29]:
sns.set_style('darkgrid')
pd.plotting.register_matplotlib_converters()
sns.mpl.rc('figure', figsize=(16,6))
In [30]:
fig,ax = plt.subplots()
ax = crimedf.plot(ax=ax)
In [31]:
crime_rate=crimedf.copy()
crime_rate['Violent crime rate'] = crimedf['Violent crime total']/crimedf['Population']
crime_rate['Year']=crimedf['Year']
fig, ax = plt.subplots()
ax = crime_rate['Violent crime rate'].plot(ax=ax)
In [32]:
fig,ax = plt.subplots()
ax = transactdf.plot(ax=ax)
In [33]:
fig,ax = plt.subplots()
ax = transactdf['Revenue'].plot(ax=ax)
In [34]:
transact_small=transactdf.copy()
transact_small.drop(['Transaction'],axis=1, inplace=True)
transact_small.drop(['Customer ID'],axis=1, inplace=True)
In [35]:
import matplotlib.pylab as plt
pd.plotting.lag_plot(transact_small['Revenue'])
Out[35]:
<AxesSubplot:xlabel='y(t)', ylabel='y(t + 1)'>
In [36]:
pd.plotting.lag_plot(crime_rate['Violent crime rate'])
Out[36]:
<AxesSubplot:xlabel='y(t)', ylabel='y(t + 1)'>
In [37]:
pd.plotting.autocorrelation_plot(crime_rate['Violent crime rate'])
Out[37]:
<AxesSubplot:xlabel='Lag', ylabel='Autocorrelation'>
In [42]:
transact_small['Revenue'].corr(transact_small['Revenue'].shift(50))
Out[42]:
-0.004421155977020592
In [45]:
crime_rate['Violent crime rate'].corr(crime_rate['Violent crime rate'].shift(30))
Out[45]:
-0.9654049596830483
In [46]:
from statsmodels.tsa.ar_model import AutoReg
#to set up training set for time series data, use the first 80% of the data, and test is the last 20%. don't randomize.
model = AutoReg(crime_rate['Violent crime rate'],1, old_names=False)
model_fitted = model.fit()
In [47]:
model_fitted.params
Out[47]:
const                    0.000291
Violent crime rate.L1    0.949613
dtype: float64
In [48]:
from statsmodels.graphics.tsaplots import plot_pacf
In [49]:
plot_pacf(crime_rate['Violent crime rate'], lags=20)
plt.xlabel('Lags', fontsize=12)
plt.ylabel('Partial Autocorrelation', fontsize=12)
plt.show()
#based on the graph below, use lags of 1 and 2 in the model at least
In [50]:
from statsmodels.tsa.stattools import adfuller

result = adfuller(crime_rate['Violent crime rate'])
print('p-value: %.2f' % result[1])
p-value: 0.24
In [51]:
crime_rate['Difference'] = crime_rate['Violent crime rate'].diff()

result = adfuller(crime_rate['Difference'].dropna())
print('p-value: %.2f' % result[1])
p-value: 0.03
In [52]:
model = AutoReg(crime_rate['Difference'].dropna(),2, old_names=False)
model_fitted = model.fit()
C:\Users\Betsy\Anaconda3\lib\site-packages\statsmodels\tsa\base\tsa_model.py:579: ValueWarning: An unsupported index was provided and will be ignored when e.g. forecasting.
  ' ignored when e.g. forecasting.', ValueWarning)
In [53]:
model_fitted = model.fit()
In [54]:
model_fitted.params
Out[54]:
const            0.000014
Difference.L1    0.669151
Difference.L2   -0.030646
dtype: float64
In [ ]: