import pandas as pd
import numpy as np
# Load the survey responses and keep only the columns used for modelling.
survey_columns = ['Age', 'Gender', 'State', 'Children', 'Salary', 'Agree or Not']
df = pd.read_excel('survey_data2.xlsx')[survey_columns]

# One-hot encode the categorical columns one at a time: append the indicator
# columns (first level dropped as the baseline), then remove the raw column.
for column, dummy_prefix in (('Age', '_'), ('State', 'State')):
    indicators = pd.get_dummies(df[column], prefix=dummy_prefix, drop_first=True)
    df = pd.concat([df, indicators], axis=1).drop(columns=[column])

# Shift the Gender codes down by one (presumably recoding 1/2 to 0/1 --
# confirm against the data).
df['Gender'] = df['Gender'] - 1

# Random 80/20 train/test split; the fixed seed keeps the split reproducible.
df_copy = df.copy()
train_set = df_copy.sample(frac=0.80, random_state=0)
test_set = df_copy.drop(train_set.index)

# Separate the target column from the features in both splits.
test_set_labels = test_set.pop('Agree or Not')
train_set_labels = train_set.pop('Agree or Not')


from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
# Standardise the training features. The fitted scaler is kept so the same
# transformation can be applied to other data later.
scaler = StandardScaler().fit(train_set)
scaled_features = scaler.transform(train_set)


# Multi-layer perceptron with four hidden layers of 17 units each, trained
# for at most 1000 iterations on the standardised training features.
mlp = MLPClassifier(hidden_layer_sizes=(17, 17, 17, 17), max_iter=1000)
mlp.fit(scaled_features, train_set_labels)
MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(17, 17, 17, 17), learning_rate='constant',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)
# The network was trained on standardised features, so it must receive
# standardised inputs at prediction time too; passing the raw training frame
# feeds it values on a completely different scale. To score the test set,
# transform it with the already-fitted scaler first:
#     mlp.predict(scaler.transform(test_set))
predictions = mlp.predict(scaled_features)

C:\Users\Betsy\Anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py:585: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (1000) reached and the optimization hasn't converged yet.
  % self.max_iter, ConvergenceWarning)


from sklearn.metrics import classification_report,confusion_matrix


# Confusion matrix for the training-set predictions (first argument is the
# true labels, second is the predicted labels).
print(confusion_matrix(train_set_labels, predictions))

[[184   0]
 [135   0]]


# Per-class precision, recall and F1 for the training-set predictions.
print(classification_report(train_set_labels, predictions))

              precision    recall  f1-score   support

           0       0.58      1.00      0.73       184
           1       0.00      0.00      0.00       135

    accuracy                           0.58       319
   macro avg       0.29      0.50      0.37       319
weighted avg       0.33      0.58      0.42       319

C:\Users\Betsy\Anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1221: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))


import re


# Sample sentence used by the regular-expression examples.
text = "The rain in Spain is mainly on the plain."
# Every non-overlapping occurrence of the literal "ai", in order.
x = [hit.group() for hit in re.finditer("ai", text)]
x

['ai', 'ai', 'ai', 'ai']


# Split the sentence at every whitespace character. The pattern is a raw
# string so the backslash reaches the regex engine verbatim instead of being
# parsed as an (invalid) string escape, which newer Python versions warn about.
x = re.split(r"\s", text)
x

['The', 'rain', 'in', 'Spain', 'is', 'mainly', 'on', 'the', 'plain.']


# Split only at the first whitespace character. The pattern is a raw string
# (no invalid string escape), and the limit is passed by keyword because
# passing maxsplit positionally is deprecated in recent Python versions.
x = re.split(r"\s", text, maxsplit=1)
x

['The', 'rain in Spain is mainly on the plain.']


# Replace every whitespace character with a hyphen. The pattern is a raw
# string so the backslash is not parsed as an (invalid) string escape.
x = re.sub(r'\s', "-", text)
x

'The-rain-in-Spain-is-mainly-on-the-plain.'


# First word starting with a capital "S": a word boundary, the letter "S",
# then one or more word characters. span() gives its (start, end) offsets.
x = re.compile(r"\bS\w+").search(text)
x.span()

(12, 17)


# .string is the full text that was passed to the search.
print(x.string)

The rain in Spain is mainly on the plain.


# .group() with no arguments returns the whole matched substring.
print(x.group())

Spain


# Every single character that is one of "m", "a" or "t", in order of appearance.
x = [hit.group() for hit in re.finditer('[mat]', text)]
x

['a', 'a', 'm', 'a', 't', 'a']


# Every occurrence of "ai" followed by one or more "n" characters.
x = [hit.group() for hit in re.finditer('ain+', text)]
x

['ain', 'ain', 'ain', 'ain']


# More regex functions and pattern syntax: https://www.w3schools.com/python/python_regex.asp


# Load both workbooks, then preview the first rows and the column types.
crimedf, transactdf = (
    pd.read_excel(workbook)
    for workbook in ('crime_data.xlsx', 'transaction_data.xlsx')
)
crimedf.head()


transactdf.head()


crimedf.dtypes

Year                                    int64
Population                              int64
Violent crime total                     int64
Murder and nonnegligent manslaughter    int64
Forcible rape                           int64
Robbery                                 int64
Aggravated assault                      int64
Property crime total                    int64
Burglary                                int64
Larceny-theft                           int64
Motor vehicle theft                     int64
dtype: object


# Column types of the transaction table.
transactdf.dtypes

Transaction                    int64
Purchase Date         datetime64[ns]
Customer ID                    int64
Gender                        object
Marital Status                object
Homeowner                     object
Children                       int64
Annual Income                 object
City                          object
State or Province             object
Country                       object
Product Family                object
Product Department            object
Product Category              object
Units Sold                     int64
Revenue                      float64
dtype: object


import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.ar_model import AutoReg, ar_select_order
from statsmodels.tsa.api import acf, pacf, graphics


# Plot the crime table on a fresh figure, using whatever plotting style is
# currently active.
fig,ax = plt.subplots()
ax = crimedf.plot(ax=ax)


# Switch to seaborn's dark-grid style, register pandas' converters with
# matplotlib, and make new figures 16x6 by default.
sns.set_style('darkgrid')
pd.plotting.register_matplotlib_converters()
sns.mpl.rc('figure', figsize=(16,6))


# Same plot again, now drawn after the settings above were applied.
fig,ax = plt.subplots()
ax = crimedf.plot(ax=ax)


# Work on a copy so the raw crime table stays untouched. The copy already
# carries every original column (including Year), so nothing needs to be
# re-assigned after copying.
crime_rate = crimedf.copy()
# Violent crimes per resident for each row.
crime_rate['Violent crime rate'] = crime_rate['Violent crime total'] / crime_rate['Population']
fig, ax = plt.subplots()
ax = crime_rate['Violent crime rate'].plot(ax=ax)


# Plot the transaction table on a fresh figure.
fig,ax = plt.subplots()
ax = transactdf.plot(ax=ax)


# Plot only the Revenue column on its own figure.
fig,ax = plt.subplots()
ax = transactdf['Revenue'].plot(ax=ax)


# Copy of the transaction table without the two identifier columns.
transact_small = transactdf.drop(columns=['Transaction', 'Customer ID'])


import matplotlib.pylab as plt
# Lag plot of Revenue: each value y(t) against the following value y(t + 1).
pd.plotting.lag_plot(transact_small['Revenue'])

<AxesSubplot:xlabel='y(t)', ylabel='y(t + 1)'>


# Lag plot of the violent crime rate: y(t) against y(t + 1).
pd.plotting.lag_plot(crime_rate['Violent crime rate'])

<AxesSubplot:xlabel='y(t)', ylabel='y(t + 1)'>


# Autocorrelation of the violent crime rate as a function of the lag.
pd.plotting.autocorrelation_plot(crime_rate['Violent crime rate'])

<AxesSubplot:xlabel='Lag', ylabel='Autocorrelation'>


# Correlation between Revenue and itself shifted by 50 rows.
transact_small['Revenue'].autocorr(lag=50)

-0.004421155977020592


# Correlation between the violent crime rate and itself shifted by 30 rows.
crime_rate['Violent crime rate'].autocorr(lag=30)

-0.9654049596830483


from statsmodels.tsa.ar_model import AutoReg
# Note: to build a training set for time-series data, use the first 80% of
# the observations and keep the last 20% for testing -- do not randomize.
# The model here is fitted on the full series.
# First-order autoregression of the violent crime rate.
model = AutoReg(endog=crime_rate['Violent crime rate'], lags=1, old_names=False)
model_fitted = model.fit()


model_fitted.params

const                    0.000291
Violent crime rate.L1    0.949613
dtype: float64


from statsmodels.graphics.tsaplots import plot_pacf


# Partial autocorrelation plot of the violent crime rate, up to lag 20,
# with axis labels added before the figure is shown.
plot_pacf(crime_rate['Violent crime rate'], lags=20)
plt.xlabel('Lags', fontsize=12)
plt.ylabel('Partial Autocorrelation', fontsize=12)
plt.show()
# Based on the resulting graph, use at least lags 1 and 2 in the model.


from statsmodels.tsa.stattools import adfuller

# Augmented Dickey-Fuller test on the raw rate; element 1 of the result is
# printed as the p-value.
result = adfuller(crime_rate['Violent crime rate'])
print('p-value: %.2f' % result[1])

p-value: 0.24


# First difference of the rate (row-to-row change); the first row has no
# predecessor, so it becomes NaN.
crime_rate['Difference'] = crime_rate['Violent crime rate'].diff()

# Repeat the Dickey-Fuller test on the differenced series, with the NaN removed.
result = adfuller(crime_rate['Difference'].dropna())
print('p-value: %.2f' % result[1])

p-value: 0.03


# Second-order autoregression on the differenced rate. diff() leaves a NaN in
# the first row; once it is dropped the index no longer starts at 0, which
# statsmodels reports as an unsupported index (and then ignores). Resetting to
# a fresh 0-based index avoids that warning; the data passed in are unchanged.
model = AutoReg(crime_rate['Difference'].dropna().reset_index(drop=True), 2, old_names=False)
model_fitted = model.fit()

C:\Users\Betsy\Anaconda3\lib\site-packages\statsmodels\tsa\base\tsa_model.py:579: ValueWarning: An unsupported index was provided and will be ignored when e.g. forecasting.
  ' ignored when e.g. forecasting.', ValueWarning)


# NOTE(review): this fits the same model object again; it looks redundant
# with the fit performed right after the model was created -- confirm.
model_fitted = model.fit()


# Estimated coefficients of the fitted model (constant and lag terms).
model_fitted.params

const            0.000014
Difference.L1    0.669151
Difference.L2   -0.030646
dtype: float64

	Year	Population	Violent crime total	Murder and nonnegligent manslaughter	Forcible rape	Robbery	Aggravated assault	Property crime total	Burglary	Larceny-theft	Motor vehicle theft
0	1960	179323175	288460	9110	17190	107840	154320	3095700	912100	1855400	328200
1	1961	182992000	289390	8740	17220	106670	156760	3198600	949600	1913000	336000
2	1962	185771000	301510	8530	17550	110860	164570	3450700	994300	2089600	366800
3	1963	188483000	316970	8640	17650	116470	174210	3792500	1086400	2297800	408300
4	1964	191141000	364220	9360	21420	130390	203050	4200400	1213200	2514400	472800

	Transaction	Purchase Date	Customer ID	Gender	Marital Status	Homeowner	Children	Annual Income	City	State or Province	Country	Product Family	Product Department	Product Category	Units Sold	Revenue
0	1	2014-12-18	7223	F	S	Y	2	$30K - $50K	Los Angeles	CA	USA	Food	Snack Foods	Snack Foods	5	27.38
1	2	2014-12-20	7841	M	M	Y	5	$70K - $90K	Los Angeles	CA	USA	Food	Produce	Vegetables	5	14.90
2	3	2014-12-21	8374	F	M	N	2	$50K - $70K	Bremerton	WA	USA	Food	Snack Foods	Snack Foods	3	5.52
3	4	2014-12-21	9619	M	M	Y	3	$30K - $50K	Portland	OR	USA	Food	Snacks	Candy	4	4.44
4	5	2014-12-22	1900	F	S	Y	3	$130K - $150K	Beverly Hills	CA	USA	Drink	Beverages	Carbonated Beverages	4	14.00