Term Paper: Statistical Programming

M.Sc. Business Administration

University of Hamburg

by Kim Wagner, Annika Zeyn and Georg Zhelev

Matriculation numbers: 6916663 6933827 7087423

Date: 01.07.2021

1. Distributions

First a continuous random variable $X$ with a density function

$$f_X(x)=\begin{cases}\frac{1}{4}\quad \text{if}\ 0\le x<2\\ \frac{1}{4}x\quad \text{if}\ 2\le x\le \sqrt{8}\\ 0\quad\text{otherwise}\end{cases}$$

is considered. To obtain the cumulative distribution function, the probability density function is integrated. For $x$ in $[0, 2)$ we have $F(x) = \int_0^x \frac{1}{4} \, dt = \frac{1}{4}x$, and for $x$ in $[2, \sqrt{8}]$ we have $F(x) = F(2) + \int_2^x \frac{1}{4}t \, dt = \frac{1}{2} + \frac{x^2-4}{8} = \frac{x^2}{8}$. Since $f(x)$ is $0$ outside of $[0, \sqrt{8}]$, we know $F(a) = P(X \le a) = 0$ for $a < 0$ and $F(a) = 1$ for $a > \sqrt{8}$. Therefore the cumulative distribution function can be written as

$$F_X(x)=\begin{cases}0\quad \text{if}\ x<0\\ \frac{1}{4}x\quad \text{if}\ 0\le x<2\\ \frac{x^2}{8}\quad \text{if}\ 2\le x\le \sqrt{8}\\ 1\quad\text{if}\ x >\sqrt{8}\end{cases}.$$

The above PDF and CDF are plotted. An interval on which the functions are plotted is set; it equals the interval on which the function is defined, but a bigger interval could also be used. The cumulative distribution function (CDF) can be obtained numerically by normalizing the PDF values (dividing each value on the grid by the sum of all values) and taking their cumulative sum, or analytically from the integration shown above. The PDF in the plot follows the equation above and is $0$ outside of the defined interval. The CDF shows the cumulative probability, which lies between $0$ and $1$.
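
The following minimal sketch shows how such a plot could be produced with numpy and matplotlib; the function names `pdf` and `cdf` and the plotting details are illustrative assumptions, not the original notebook code.

```python
import numpy as np
import matplotlib.pyplot as plt

def pdf(x):
    """Piecewise density f_X(x) as defined above."""
    return np.where((x >= 0) & (x < 2), 0.25,
           np.where((x >= 2) & (x <= np.sqrt(8)), 0.25 * x, 0.0))

def cdf(x):
    """Piecewise distribution function F_X(x) from the integration above."""
    return np.where(x < 0, 0.0,
           np.where(x < 2, 0.25 * x,
           np.where(x <= np.sqrt(8), x**2 / 8, 1.0)))

x = np.linspace(-0.5, np.sqrt(8) + 0.5, 500)   # slightly larger than the support
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
ax1.plot(x, pdf(x)); ax1.set_title("PDF")
ax2.plot(x, cdf(x)); ax2.set_title("CDF")
plt.show()
```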

Next the inverse $F^{-1}_X:(0,1)\mapsto\mathbb{R}$ of the CDF is derived. On the interval $(0,1)$, $F^{-1}_X$ is found by inverting each case of the CDF, i.e. by swapping $x$ and $y$ and solving for $y$, which yields

$$F^{-1}_X(x)=\begin{cases} 4x\quad \text{if}\ 0\le x<\frac{1}{2}\\ \sqrt{8x} \quad \text{if}\ \frac{1}{2} \le x\le 1\end{cases}.$$

The inverse CDF is then defined according to the formula above and plotted. It takes a probability between $0$ and $1$ as input and returns the corresponding quantile of $X$, i.e. a value between $0$ and $\sqrt{8}$.
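
A short sketch of how this quantile function could be implemented, assuming the hypothetical name `inv_cdf`; it directly mirrors the piecewise formula above.

```python
import numpy as np
import matplotlib.pyplot as plt

def inv_cdf(p):
    """Quantile function F_X^{-1}(p) for p in (0, 1)."""
    p = np.asarray(p, dtype=float)
    return np.where(p < 0.5, 4 * p, np.sqrt(8 * p))

p = np.linspace(0.001, 1, 500)
plt.plot(p, inv_cdf(p))
plt.xlabel("p"); plt.ylabel("$F_X^{-1}(p)$")
plt.show()
```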

Next the population mean of the random variable $X$ is calculated. This can be done analytically by integration or numerically using the definition of the PDF. One way is to take the definite integral of $x \cdot f_X(x)$ over the support,

$$\mu_f := \int_{0}^{\sqrt{8}}x \cdot f_X(x) \ dx = \left[\frac{x^2}{8}\right]_0^2 + \left[\frac{x^3}{12}\right]_2^{\sqrt{8}} = \frac{1}{2} + \frac{8\sqrt{8}-8}{12} \approx \underline{\underline{1.719}}.$$

Per the definition of the PDF, the population mean can also be approximated numerically by summing $x \cdot f_X(x)$ over an equally spaced grid on the support, weighted by the grid spacing $\Delta x$ (equivalently, by normalizing the density values so that they sum to one):

$$\mu_f \approx \sum_{x}{x \cdot f_X(x)\,\Delta x} \approx 1.719$$

Next $100$ random numbers with distribution $F_X$ are simulated. The samples are drawn using the inversion method and the inverse CDF defined above. The population mean is estimated by the sample mean. The bias of this estimate is $0.073$. To decrease the bias, the sample has to be enlarged: a random number is appended to the sample until the absolute bias is lower than $0.01$. The augmented sample then contains $783$ observations. A random seed is used for reproducibility.
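
A sketch of the inversion method and the augmentation loop. The generator, seed and variable names are assumptions, so the resulting bias and sample size will differ from the $0.073$ and $783$ reported above, which depend on the original seed.

```python
import numpy as np

def inv_cdf(p):
    """Quantile function of F_X, as derived above."""
    return np.where(p < 0.5, 4 * p, np.sqrt(8 * p))

mu_pop = 0.5 + (8 * np.sqrt(8) - 8) / 12       # population mean (approx. 1.719)

rng = np.random.default_rng(42)                 # assumed seed, not the one used in the paper
sample = inv_cdf(rng.uniform(size=100))         # inversion method: U ~ Unif(0,1) -> F^{-1}(U)
print("bias:", abs(sample.mean() - mu_pop))

# append one random number at a time until the absolute bias drops below 0.01
while abs(sample.mean() - mu_pop) >= 0.01:
    sample = np.append(sample, inv_cdf(rng.uniform()))
print("augmented sample size:", sample.size)
```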

A decreasing bias is expected: the more random numbers from the population distribution are included, the richer the sample, so its empirical mean gets asymptotically close to the population mean. The length of the augmented sample shows how many new random numbers had to be sampled before the bias dropped below $0.01$.

The next graphic illustrates the convergence of the sample mean to the population mean. Because the same seed was used as above, the augmented sample requires $683$ additional random numbers to reach a bias of under $0.01$. The original sample had $100$ random numbers, giving a total size of $783$. As additional random numbers are added to the sample, the graphic below shows the convergence of the sample mean (blue line) to the population mean (dotted red line).

The empirical cumulative distribution function (ECDF) of the sample $S$ is plotted, evaluated on a grid of points $x$, and compared with the population distribution $F_X$. As a reminder, random numbers were added to $S$ until the bias was below $0.01$, so the empirical CDF is expected to be similar to the population CDF. The graphic below confirms this, although the match is not exact, due to randomness and the finite sample size. With a much bigger sample of more than $783$ random numbers, say $10{,}000$, the ECDF would approximate the population CDF almost exactly, as the additional graphic below shows.
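
A possible sketch of the ECDF comparison; the sample generation, grid and names are illustrative assumptions.

```python
import numpy as np
import matplotlib.pyplot as plt

def cdf(x):
    return np.where(x < 0, 0.0, np.where(x < 2, 0.25 * x,
           np.where(x <= np.sqrt(8), x**2 / 8, 1.0)))

def ecdf(sample, x):
    """Share of observations less than or equal to each evaluation point."""
    return np.array([(sample <= xi).mean() for xi in x])

rng = np.random.default_rng(42)                    # assumed seed
u = rng.uniform(size=783)
sample = np.where(u < 0.5, 4 * u, np.sqrt(8 * u))  # inversion method, as above

x = np.linspace(0, np.sqrt(8), 200)
plt.step(x, ecdf(sample, x), where="post", label="ECDF of S")
plt.plot(x, cdf(x), "r--", label="population CDF $F_X$")
plt.legend(); plt.show()
```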

2. Hypothesis Tests

In the following assignment we cover the topic of hypothesis testing in two settings. In the first part we model the binomial distribution of an urn experiment and calculate a confidence interval at a given significance level. In the second part we analyse a housing price data set: first with descriptive statistics, and then by testing five scientific hypotheses with methods of inferential statistics.

2.1 Urn Experiment

Distribution Modeling

We consider an urn that contains three white and five black balls. A ball is drawn five times with replacement. The random variable $X$, defined as the number of white balls drawn, is of interest. To model the draws, a binomial distribution with two parameters is needed: the total number of draws $n$ and the probability of drawing a white ball $p=\frac{3}{8}$. The outcome, the number of white balls $k \in \{0,...,5\}$ in the sample of $n=5$ draws, can thus be modeled. The balls in the urn are represented by the following vector:

Of interest is the distribution that describes the experiment. In particular, we want to quantify the random variable $X$ that counts the number of white balls drawn in $5$ draws. The distribution that suits this experiment is the binomial distribution $B$, which describes a random variable $X$ counting the number of successes $k$ after $n$ independent draws with success probability $p$. In our model $n=5$ and $p=0.375$, and the outcome $k$ lies in $\{0,1,2,3,4,5\}$. For these parameters the true mean of the binomial distribution is $\text{mean}_{pop} = n \cdot p = 1.875$, which is the expected number of white balls drawn from the urn in one sample. In the next part we simulate the urn drawing with replacement.

We use the function random.choice from the numpy package to simulate the experiment, i.e. draw five balls with replacement. One simulation of the experiment gives one random realization of $X$. By setting np.random.seed(1234) to an arbitrary integer, we fix the built-in pseudo-random number generator. The random.choice function takes four arguments: the previously defined urn vector, the size of our sample ($n=5$), a boolean argument for the replacement, and a probability vector for drawing each item out of the urn. The next output is a sample of 5 urn drawings with replacement:

In the next step we generate $N=100$ independent observations of $X$: $X_1,\dots,X_{100}$. For this we replicate the previously defined experiment 100 times and save each result in a vector $S$. With the random.seed function we fix the pseudo-random seed so that every run of this cell produces the same "random" results. Then a custom function draw(N, urn), which simulates $N$ observations of $X$, is defined; it simply wraps the replication of random.choice for an arbitrary number of iterations $N$.
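
A sketch of how the urn, a single experiment and the draw(N, urn) wrapper could look; the 0/1 encoding of black and white balls is an assumption.

```python
import numpy as np

urn = np.array([1, 1, 1, 0, 0, 0, 0, 0])   # assumed encoding: 1 = white, 0 = black

np.random.seed(1234)
one_draw = np.random.choice(urn, size=5, replace=True)   # five draws with replacement
print(one_draw, "-> X =", one_draw.sum())                # one realization of X

def draw(N, urn):
    """Simulate N independent observations of X (number of white balls in 5 draws)."""
    return np.array([np.random.choice(urn, size=5, replace=True).sum() for _ in range(N)])
```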

With the custom function we simulate a sample of size $100$ and calculate the empirical mean (with np.random.seed(7)). We estimate the mean of the distribution by the sample mean of the simulation output. For $N=100$ we apply the draw function and generate the random vector $S$ with the results of 100 random samples of the urn experiment; random.seed again ensures the same draws when rerunning the cell. By measuring the absolute deviation of the sample mean of $S$ from the expected (theoretical) mean, we obtain the empirical bias of the simulated experiment. By the law of large numbers we expect this bias to be close to zero for large $N$.

We see that the bias of the sample mean relative to the true distribution mean of $1.875$ is low. This is due to randomness and the fact that a sample of $100$ iterations is large enough for a decent approximation. A colleague shows another random sample and asks if the underlying distribution has the same population mean as the random variable $X$. To answer this, we calculate the sample mean of the new sample and compare it with the true distribution mean.

We see that the new sample clearly has a higher sample mean. To determine whether this difference is statistically significant, a 90% asymptotic confidence interval for the population mean is built. Further information about the distribution of the data is not necessary, because the central limit theorem states that means of i.i.d. samples are approximately normally distributed for large $N$. For $N=100>30$ we can rely on this approximation and build the confidence interval for the mean of the known urn experiment distribution. We construct the symmetric CI with 90% confidence, so that 90% of all sample means of samples drawn from this distribution are expected to lie in this interval. If the sample mean of the new sample lies outside of this CI, we are 90% confident that the underlying distribution does not match our urn experiment. We build the confidence interval of the true distribution and check whether the new sample fits into it via $\mu\pm \frac{\sigma_n}{\sqrt{n}}q_z(0.90) = (1.74, 2.01)$.
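
A sketch of how this interval could be computed, using the quantile $q_z(0.90)$ stated in the text; scipy is assumed to be available and the variable names are illustrative.

```python
import numpy as np
from scipy import stats

n, p = 5, 3 / 8
mu = n * p                                   # 1.875
sigma = np.sqrt(n * p * (1 - p))             # standard deviation of one observation of X
N = 100                                      # size of the new sample

z = stats.norm.ppf(0.90)                     # quantile used in the text
half_width = sigma / np.sqrt(N) * z
print((mu - half_width, mu + half_width))    # roughly (1.74, 2.01)
```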

The theoretical confidence interval for the mean of the urn experiment ($\mu = 1.875$) at 90% confidence level does not contain the sample mean of the new sample. Therefore, we reject the assumption that this sample was drawn by our urn experiment. Confidence Intervals are one part of inference statistics that can be used to test scientific hypotheses. In the next section we introduce further inference methods to test more hypotheses.

2.2 Boston Housing Data Set

Descriptive Statistics

In this part we analyse the given Boston housing data. We prepare the data to estimate the prices (response $y$) of houses in Boston with all regressors ($X$) available in the data set. We do this with the Python pandas package, preparing the $y$ and $X$ variables. An excerpt of the data frame is shown below.

As descriptive statistics for the Boston data set we consider a few standard methods to get familiar with the data. First, we inspect the dimensions of the data set, i.e. its row and column sizes. Second, we calculate the sample mean and standard deviation of each variable. Further, we look at the histogram of the median value of owner-occupied homes (MEDV), and lastly we inspect whether old houses have different prices than newer houses. For this last statistic we consider 50 years as the threshold that separates old and new.

The data set consists of 506 observations and 13 regressors. In the next chart the histogram of MEDV is displayed; one can see that the distribution does not look normal.

This result makes sense, because older houses require more investment in terms of renovation costs and are therefore cheaper on average.

Inference Statistics

In the last part we test five scientific hypotheses. Since Hypotheses 1, 2 and 4 can be formulated as tests for differences in the means of two groups, we apply an unpaired, independent t-test based on Student's t-distribution. The t-distribution is used because studentized sample means (and therefore also their differences) can be shown to be t-distributed. For Hypothesis 3 we apply a Kolmogorov-Smirnov test to check whether both samples were drawn from the same population. Hypothesis 5 is tested with a Shapiro-Wilk test to determine whether the sample is normally distributed.

First we test whether the mean of the median housing prices equals 20. We apply a one-sample t-test. Because the p-value is close to 0, the null hypothesis that the mean is 20 is rejected.
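
A sketch of this one-sample t-test with scipy; the file name is a placeholder and the column name MEDV is an assumption about how the data is loaded.

```python
import pandas as pd
from scipy import stats

df = pd.read_csv("boston.csv")                 # hypothetical file name for the Boston data
t_stat, p_value = stats.ttest_1samp(df["MEDV"], popmean=20)
print(t_stat, p_value)                         # p close to 0 -> reject H0: mean(MEDV) = 20
```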

Second we test whether houses that are close to the Charles River are, on average, as expensive as houses that are not situated at the river. For this we apply a standard t-test as well as a Welch test, which allows for different variances in the two samples. Because in both cases the p-value is close to 0, the null hypothesis that the two groups of houses have identical average prices is rejected. Therefore houses close to the Charles River are, on average, more expensive than houses not situated at the river.

Third we test whether the distribution of house prices is affected by proximity to the Charles River. Using the Kolmogorov-Smirnov test, we test whether the two samples come from different distributions; the null hypothesis is that they come from the same distribution. The p-value of the test is 0.01, which means that at the 1% significance level we cannot reject equality of the distributions, but at the 5% and 10% levels we conclude that the distributions are different.
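
A sketch covering the tests of Hypotheses 2 and 3; the file name and the standard Boston column names CHAS and MEDV are assumptions.

```python
import pandas as pd
from scipy import stats

df = pd.read_csv("boston.csv")                       # hypothetical file name
river = df.loc[df["CHAS"] == 1, "MEDV"]
no_river = df.loc[df["CHAS"] == 0, "MEDV"]

# Welch two-sample t-test for the difference in mean prices (Hypothesis 2)
print(stats.ttest_ind(river, no_river, equal_var=False))

# Kolmogorov-Smirnov test for equality of the two distributions (Hypothesis 3)
print(stats.ks_2samp(river, no_river))
```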

Fourth, it is tested whether the per capita crime rate by town ('CRIM') is, on average, higher in districts where the share of lower-status population ('LSTAT') is above 10%. A district is classified as "low status" if more than 10% of its population is lower status, and as "non-low status" otherwise. The one-sided t-test has a p-value close to 0, so the null hypothesis that the average per capita crime rate is the same in low-status and non-low-status districts is rejected. The crime rate is indeed higher in low-status districts.

Finally we test whether the assumption of normality is justified for the variable 'MEDV'. The test result shows that it is not, because the null hypothesis is rejected at the 5% significance level: the median housing price is not normally distributed.

In this assignment we learned how to implement basic statistical ideas from descriptive and inferential statistics. After modeling distributions and calculating confidence intervals for the sample mean, we analyzed a pandas data set and prepared it for regression modeling. Further, we tested a handful of scientific hypotheses and rejected the null hypotheses in cases of small p-values (p < 5%) that indicate statistical significance for the alternative hypotheses. We have shown that Python and its packages are a suitable tool for stochastic and statistical programming.

3. Functions and Simulations

Let $X_1,\dots,X_n$ be i.i.d. $\sim\mathcal{N}(\mu,\sigma^2)$, where $\mu$ and $\sigma^2$ are unknown. A confidence interval at significance level $\alpha$ (i.e. with coverage $1-\alpha$) for the unknown variance can be constructed by

$$\left(\frac{n\hat{s}^2_n}{\chi^2_{n-1,1-\alpha/2}},\frac{n\hat{s}^2_n}{\chi^2_{n-1,\alpha/2}}\right)$$

with $\hat{s}^2_n=\frac{1}{n}\sum_{i=1}^n(X_i - \bar{X}_n)^2$, where $\chi^2_{k,\alpha}$ denotes the $\alpha$-quantile of the $\chi^2$-distribution with $k$ degrees of freedom. This confidence interval can be used to test the hypothesis that the population variance equals some given value $\sigma_0^2$. First, the objective is to calculate a confidence interval at the 5% significance level (a 95% confidence interval) and check whether the true variance is included. Therefore, we set $\mu=0$ and $\sigma^2=4$, and $n=100$ observations of $X\sim\mathcal{N}(\mu,\sigma^2)$ are simulated. This process is repeated 20 times.

As can be seen, our confidence interval of (3.196, 5.595) does indeed include the true variance. The 20 calculated confidence intervals are visualized in the graphic above as blue lines with their respective widths, while the red line marks the true variance. With a confidence level of 95%, i.e. a 5% error rate, the confidence interval should fail to include the true population variance about $\frac{1}{20} \cdot 100 = 5$% of the time. In the graphic this happens for exactly one interval, the 13th, which does not include the population variance.
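
A sketch of this simulation. The seed and plotting details are assumptions, so the exact intervals, including the reported (3.196, 5.595) and the observation that the 13th interval misses the true variance, will not be reproduced exactly.

```python
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

rng = np.random.default_rng(0)                 # assumed seed
mu, sigma2, n, alpha, reps = 0, 4, 100, 0.05, 20

lowers, uppers = [], []
for _ in range(reps):
    x = rng.normal(mu, np.sqrt(sigma2), size=n)
    s2 = np.mean((x - x.mean()) ** 2)                          # biased variance estimate s^2_n
    lowers.append(n * s2 / stats.chi2.ppf(1 - alpha / 2, df=n - 1))
    uppers.append(n * s2 / stats.chi2.ppf(alpha / 2, df=n - 1))

for i, (lo, hi) in enumerate(zip(lowers, uppers)):
    plt.plot([lo, hi], [i, i], "b-")                           # one line per interval
plt.axvline(sigma2, color="red", label="true variance")
plt.legend(); plt.show()
```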

Next, we define a function $sigma\_test(X, \sigma_0^2, \alpha)$, which tests the hypothesis

$$H_0:\sigma^2=\sigma_0^2 \text{ against }H_1:\sigma^2\neq\sigma_0^2 $$

for some given level $\alpha$ and a given vector of observations $X$ (based on the confidence interval above). The function should return $1$ if the null hypothesis is rejected and $0$ otherwise.

The chi-squared test for the variance determines whether the variance of a variable in a particular sample equals a given population variance. When testing whether the population variance equals 4, for example, the test should return 0, because the null hypothesis cannot be rejected. When we repeat this test 1000 times in the next exercise, we expect the function to return 1 (reject) about 5% of the time, because we set alpha to 0.05. To check whether the test really has significance level $\alpha$, a simulation with $l=1000$ repetitions is run with $\mu=0, \sigma^2=\sigma_0^2=4, n=100$ and $\alpha=0.05$.
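
A sketch of the sigma_test function and the rejection-rate simulation, built directly on the confidence interval above; the seed is an assumption, so the empirical rate will only be close to, not exactly, the value reported below.

```python
import numpy as np
from scipy import stats

def sigma_test(X, sigma0_sq, alpha):
    """Return 1 if H0: sigma^2 = sigma0_sq is rejected at level alpha, else 0."""
    n = len(X)
    s2 = np.mean((X - X.mean()) ** 2)
    lower = n * s2 / stats.chi2.ppf(1 - alpha / 2, df=n - 1)
    upper = n * s2 / stats.chi2.ppf(alpha / 2, df=n - 1)
    return int(not (lower <= sigma0_sq <= upper))

rng = np.random.default_rng(0)                  # assumed seed
rejections = [sigma_test(rng.normal(0, 2, size=100), sigma0_sq=4, alpha=0.05)
              for _ in range(1000)]
print(np.mean(rejections))                      # empirical rejection rate, close to 0.05
```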

Indeed, testing whether the population variance equals the given value of 4 yields an empirical rejection rate of 0.051, close to the nominal level of alpha = 0.05. Removing the random seed shows that the rejection rate hovers around 5%. Increasing the number of repetitions from 1,000 to 10,000 reduces this variation and brings the rate closer to 5.0%. To assess the power of the test, the previous procedure is repeated, but now with $\sigma_0^2=2$ and $\sigma^2=4$.

The results show that if the alternative hypothesis is true, there is a 99.9% chance that we correctly reject the null hypothesis. This is the power of the test for this particular value of the variance: the probability of rejecting the null hypothesis when the alternative is true, $$\text{Power} = P(\text{reject } H_0 \mid H_1 \text{ true}).$$

In the above case of the tested variance, the power is 99.9%, which indicates a very strong test. The hypothesis test used above relies on the $\chi^2$-distribution of the following test statistic: $$T(X):=\frac{n\hat{s}^2_n}{\sigma^2}\sim\chi^2(n-1).$$ The objective now is to plot a histogram of the test statistic and compare it to the density of a suitable $\chi^2$-distribution. Furthermore, the quantiles $\chi^2_{n-1,1-\alpha/2}$ and $\chi^2_{n-1,\alpha/2}$, which specify the critical region, are marked by vertical lines. It is then explained why we reject the null hypothesis $H_0$ if the test statistic $T(X)$ lies to the left or to the right of these two quantiles.

The calculated value of the test statistic is compared with the critical region of the chi-squared distribution, determined by the degrees of freedom and the significance level alpha. Because we repeatedly calculated the test statistic on normally distributed random samples, the empirical distribution of the test statistic effectively approximates the chi-squared distribution. In the graphic above, the test statistic is shown as the light blue histogram and is comparable to the density of a $\chi^2$-distribution on the same interval $x$. If the test statistic for the variance of a specific sample falls to the left or to the right of the quantiles (into the critical region), we reject the null hypothesis that the sample has the stated population variance. With alpha set to 0.05, this happens only 5% of the time when the null hypothesis is true, so the test keeps its nominal 5% error rate.
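
A sketch of how the histogram and the chi-squared density with the critical quantiles could be plotted; seed and layout are assumptions.

```python
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

rng = np.random.default_rng(0)                  # assumed seed
n, sigma2, alpha, reps = 100, 4, 0.05, 1000

# simulate the test statistic T(X) = n * s^2_n / sigma^2 for many normal samples
T = np.array([n * np.mean((x - x.mean()) ** 2) / sigma2
              for x in rng.normal(0, np.sqrt(sigma2), size=(reps, n))])

grid = np.linspace(T.min(), T.max(), 300)
plt.hist(T, bins=40, density=True, alpha=0.5, label="test statistic $T(X)$")
plt.plot(grid, stats.chi2.pdf(grid, df=n - 1), "k-", label="$\\chi^2(n-1)$ density")
plt.axvline(stats.chi2.ppf(alpha / 2, df=n - 1), color="red", linestyle="--")
plt.axvline(stats.chi2.ppf(1 - alpha / 2, df=n - 1), color="red", linestyle="--")
plt.legend(); plt.show()
```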

4. US Job Corps program

In the last exercise, we consider a welfare policy experiment with a binary treatment $D$ (assignment), which was conducted in the mid 1990s to assess the publicly funded US Job Corps program. The program targets young individuals (aged 16–24 years) who legally reside in the USA and come from a low-income household. It provides participants with approximately 1200 hours of vocational training and education, housing and board over an average duration of 8 months. Participants also receive health education as well as health and dental care.

Data Preparation

To start the data preparation, a data frame $X$ is created that includes the covariates listed below. This data frame serves as the set of regressors to determine the impact on a specific outcome variable and is also used for classification.

'assignment', 'female', 'age', 'white', 'black', 'hispanic', 'educ', 'educmis', 'geddegree', 'hsdegree', 'english', 'cohabmarried', 'haschild', 'everwkd', 'mwearn', 'hhsize', 'hhsizemis', 'educmum', 'educmummis', 'educdad', 'educdadmis', 'welfarechild', 'welfarechildmis', 'health', 'healthmis', 'smoke', 'smokemis', 'alcohol', 'alcoholmis'.

As for the dependent variable, first a binary outcome Y_class is created, which is $1$ if the general health 30 months after assignment ($health30$) is stated to be excellent ($health30=1$) and $0$ otherwise. Calculating the share based on $Y_{class}$ shows that 30 months after assignment, 38.27 percent of the people report excellent health, while 61.73 percent do not.
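
A sketch of how Y_class and its share could be computed with pandas; the file name is a placeholder, while 'health30' is the variable named in the text.

```python
import pandas as pd

df = pd.read_csv("jobcorps.csv")               # hypothetical file name for the Job Corps data
Y_class = (df["health30"] == 1).astype(int)    # 1 = excellent general health 30 months after assignment
print(Y_class.mean())                          # share with excellent health (reported as 0.3827)
```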

Another variable $Y_{pred}$ is created, which is equal to the weekly earnings in the third year after assignment ($earny3$). $Y_{pred}$ will serve as a dependent variable in different regression approaches, because it is continuous. To determine whether the participation in the job program has an effect on the dependent variables (on the participants health as well as on their weekly earnings), the sample means of the dependent variables $Y_{class}$ and $Y_{pred}$ are calculated conditioned on assignment.

The sample mean of $Y_{class}$ conditioned on the group of participants who were assigned to the program (assignment = 1) is only slightly higher than the mean conditioned on participants who were not assigned (assignment = 0) (0.38 vs 0.37). Under this naive approach, there appears to be no effect of the job program on health, because the means are very close to each other. To actually test this, a hypothesis test is needed, because it accounts for the variation in the data, from which a standard error can be built. For $Y_{pred}$ the sample means conditioned on assignment show a larger difference between the groups. Participants who were assigned (assignment = 1) have average weekly earnings of 177.48 USD, while those who were not assigned (assignment = 0) have average weekly earnings of 166.01 USD. This indicates that the assignment had an impact on earnings, as average weekly earnings are higher for people who were assigned to the Job Corps program. But is this effect significant? To answer this question, a hypothesis test comparing the group means can be used.

Testing the hypothesis for the mean comparison of $Y_{class}$ shows that the health status of people who were assigned to the job program is not significantly different from that of people who were not (even at a 'minimum' significance level of 0.1). Thus, we cannot detect a significant effect of the assignment to the job program on health status after 30 months. In contrast, the hypothesis test for $Y_{pred}$ shows that the average weekly earnings of people who were assigned differ significantly from those of people who were not assigned (at significance level 0.01). Therefore, with 99% confidence, we conclude that assignment does have a significant effect on weekly earnings after three years. Although statistically significant, this effect is of limited economic relevance, because a difference of about 11.5 dollars per week is small.

Regression

In the next step, regression analysis with $Y_{pred}$ as dependent variable is applied to determine the effect of the covariates on weekly earnings after three years. Different regression approaches are applied and their performance is evaluated and compared. To prepare the data, a train-test split is applied: $(Y_{pred},X)$ is randomly split into train (80%) and test (20%) samples. The importance of sample splitting is discussed.
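
A sketch of the train-test split with scikit-learn; the file name and the random_state are assumptions, while the covariate list is the one given above.

```python
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv("jobcorps.csv")                # hypothetical file name
covariates = ['assignment', 'female', 'age', 'white', 'black', 'hispanic', 'educ',
              'educmis', 'geddegree', 'hsdegree', 'english', 'cohabmarried', 'haschild',
              'everwkd', 'mwearn', 'hhsize', 'hhsizemis', 'educmum', 'educmummis',
              'educdad', 'educdadmis', 'welfarechild', 'welfarechildmis', 'health',
              'healthmis', 'smoke', 'smokemis', 'alcohol', 'alcoholmis']
X, y_pred = df[covariates], df['earny3']

X_train, X_test, y_train, y_test = train_test_split(
    X, y_pred, test_size=0.2, random_state=42)  # 80% train, 20% test; assumed seed
```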

Sample splitting is important for the later analysis and evaluation of the models' performance. It allows us to train a model on one part of the data (the train sample) and then test it on an unseen part, the test sample. Since we still have the full information of the test sample, i.e. the true values of the outcome variable alongside the regressors, we can evaluate the model by comparing these true values with the model's predictions. If we trained and tested the model on the same data, the results could look too good, leading us to believe that the model performs very well or at least sufficiently. If a brand new sample were then acquired and predicted, the model might not show the same good performance. Therefore, we split the sample in order to detect whether the model is learning the data perfectly including the associated noise, called overfitting, or whether it actually recognizes the underlying patterns and can generalize to an unseen sample from the same distribution.

To evaluate the performance of our models, a function is implemented that outputs the root mean squared error (RMSE) for a given pair of vectors $y$ and $\hat{y}$, based on the equation

$$\text{RMSE}=\sqrt{\frac{1}{n}\sum_{i=1}^n(y_i-\hat{y_i})^2}.$$
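
A direct sketch of such a function, assuming the hypothetical name rmse:

```python
import numpy as np

def rmse(y, y_hat):
    """Root mean squared error between observed y and predicted y_hat."""
    y, y_hat = np.asarray(y), np.asarray(y_hat)
    return np.sqrt(np.mean((y - y_hat) ** 2))
```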

In the first step the train and test root mean squared errors of the linear model $$Y_{pred} = X\beta + \epsilon$$ are calculated via OLS regression. The RMSE results are compared to the standard deviation of the outcome variable $Y_{pred}$.

First, we see that the RMSE of the train set is about the same as the RMSE of the test set, which indicates that our model is not overfitting. That the RMSE of the train set is slightly higher than that of the test set can be attributed to the random composition of the test and train samples. Second, we notice that the RMSE of the test set is lower than the standard deviation of the outcome variable y_pred. This is good, because it means the regression provides a better estimate than a naive estimator, i.e. using the mean of the outcome variable as the prediction. Still, the RMSE is close to the standard deviation of the outcome variable, which is partly explained by the mathematical similarity between calculating the RMSE and the standard deviation. Finally, we notice that the standard deviation of the predicted values is much lower than that of the outcome variable. Since the RMSE is large, this small spread in y_hat is not a good sign: as we saw, the regression estimate is hardly better than the naive mean estimator. We should therefore also expect a low R-squared value, which relates the explained spread of the estimate to the total spread in y.

The results of our linear regression are now compared to other regression methods, including lasso, ridge and elastic net regression. After comparing the performance of the respective models based on the RMSE, approaches to improve the prediction quality are discussed.

Ridge, lasso and elastic net are regression models that penalize the beta coefficients in the L1 norm (lasso), the L2 norm (ridge) or both (elastic net). Lasso and elastic net can therefore set beta coefficients to exactly zero when they have no meaningful explanatory impact, while ridge can only shrink coefficients towards zero. Because these models penalize beta coefficients that move too far from zero, their regularization parameters can be adjusted, and such adjustments influence model performance; tuning these parameters can therefore lead to better prediction performance. For further reductions in RMSE, exploratory data analysis (EDA) would be needed to determine which variable types and scales are included in the data set. For example, prediction quality could be improved by scaling variables such as age to the range 0-1, possibly separating age into bins, and taking the log of the earnings variables, which tend to be much larger than the other variables in the data set. Binning or grouping could help to capture heterogeneous groups, and removing outliers could be useful as well.
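
A sketch comparing the four regression approaches by test RMSE; the penalty strengths (alpha, l1_ratio) are illustrative, untuned values, and X_train, X_test, y_train, y_test are assumed to come from the split sketched above.

```python
import numpy as np
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet

def rmse(y, y_hat):
    return np.sqrt(np.mean((np.asarray(y) - np.asarray(y_hat)) ** 2))

models = {"ols": LinearRegression(),
          "lasso": Lasso(alpha=0.1),                     # illustrative, untuned penalties
          "ridge": Ridge(alpha=1.0),
          "elastic net": ElasticNet(alpha=0.1, l1_ratio=0.5)}

# X_train, X_test, y_train, y_test: assumed from the train-test split above
for name, model in models.items():
    model.fit(X_train, y_train)
    print(name, rmse(y_test, model.predict(X_test)))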

Finally, we are interested in the treatment effect of assignment to the Job Corps program. Therefore, the OLS regression is rerun on the whole sample without sample splitting to determine the effect of assignment on earnings and its significance. Looking at the results of the statsmodels API below, we see that, holding all other variables constant, assignment to the program increases weekly earnings after three years by 14 dollars and 22 cents. This is a significantly positive effect, because the p-value is close to zero. Considering the confidence interval, the positive effect on weekly earnings can vary between 8 and 20 dollars. The adjusted R-squared of the model is, however, only 12%, which means that only 12% of the total variation in the outcome is explained by the variables. This could be due to poor fit, because no data pre-processing was conducted, or because earnings are determined by factors not observed in this data set; data pre-processing often contributes substantially to improving model fit. Nevertheless, drawing conclusions about the treatment effect of program assignment on earnings is difficult based on this 'naive' linear regression. For a valid estimation of the treatment effect, one would need to control for all relevant covariates, so that no other influences on the dependent variable remain, preferably selecting them with a double-selection approach. We therefore observe a positive association between assignment to the Job Corps program and weekly earnings after three years, but causality cannot be interpreted further.
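
A sketch of the full-sample OLS with the statsmodels API; X and y_pred are the objects assumed from the data preparation above.

```python
import statsmodels.api as sm

# X (covariates) and y_pred (earny3) as prepared above; full sample, no split
X_const = sm.add_constant(X)                  # add an intercept
ols = sm.OLS(y_pred, X_const).fit()
print(ols.summary())                          # coefficient on 'assignment', its p-value and CI
```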

Classification

In the next part, different approaches are applied to classify the data. The variable $Y_{class}$ is the binary outcome variable for excellent health status. The data was randomly split into a train (80%) and a test (20%) sample.

Logistic Regression:

First, logistic regression was used to classify the data, and its performance was evaluated based on the accuracy on the test set. To improve performance, different tuning parameters are applied and interpreted. In general, logistic regression models the probability of a certain class. The penalty, the solver and the regularization strength are the most important tuning parameters for this classifier: the solver is the optimization algorithm, the penalty specifies the norm used in the penalization, and C is the inverse of the regularization strength. An l1 penalty, for example, can be used for feature selection, because it shrinks the coefficients of unimportant features to 0. The performance of both the l1 and l2 penalties is presented. For hyperparameter tuning we varied the regularization strength (C) and the penalty, using loops to test different combinations of hyperparameters. Overall, the best result was achieved when combining a C value of 0.1 with the l1 penalty. Using the solver 'newton-cg' could further improve the results. An overview of the results is provided at the end.
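
A sketch of such a hyperparameter loop; the C grid and the 'liblinear' solver (which supports both l1 and l2 penalties) are assumptions, and X_train, X_test, y_train, y_test refer to the classification split with Y_class described above.

```python
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

results = {}
for penalty in ["l1", "l2"]:
    for C in [0.01, 0.1, 1, 10]:
        clf = LogisticRegression(penalty=penalty, C=C, solver="liblinear", max_iter=1000)
        clf.fit(X_train, y_train)
        results[(penalty, C)] = accuracy_score(y_test, clf.predict(X_test))

for params, acc in sorted(results.items(), key=lambda kv: -kv[1]):
    print(params, round(acc, 3))              # test accuracies, best combination first
```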

Support Vector Machines

Next, support vector machines were used with different kernels and other tuning parameters to minimize the empirical error rate. Support vector machines are based on the idea of finding a hyperplane that best divides a dataset into two classes. The SVM algorithms use a set of mathematical functions defined as the kernel; the kernel takes data as input and transforms it into the required form. There are different kernel functions: linear, nonlinear, polynomial, radial basis function (RBF), and sigmoid. The polynomial kernel represents the similarity of vectors via polynomials of the original variables, allowing the learning of non-linear models. The radial basis function is a function whose value depends on the Euclidean distance between the input and some fixed point. A sigmoid function has a domain of all real numbers and a range usually between 0 and 1 or -1 and 1. Here, the radial basis function, polynomial and sigmoid kernels were used, and we varied the degree of the polynomial kernel and the C values (where C is the inverse of the regularization strength). For the sigmoid kernel the results did not differ across C values.
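
A sketch of the kernel and C comparison with scikit-learn's SVC; the parameter grid is illustrative, the split variables are assumed from above, and feature scaling, which usually helps SVMs, is left out for brevity.

```python
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# X_train, X_test, y_train, y_test as in the classification split above (assumed)
for kernel in ["rbf", "poly", "sigmoid"]:
    for C in [0.1, 1, 10]:
        clf = SVC(kernel=kernel, C=C, degree=3)   # degree only affects the polynomial kernel
        clf.fit(X_train, y_train)
        print(kernel, C, round(accuracy_score(y_test, clf.predict(X_test)), 3))
```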

$k$-nearest neighbor

Another classification approach that was used is $k$-nearest neighbors. Weights and the number of neighbors were varied to minimize the empirical error rate. First, the number of neighbors was varied with the weight set to 'uniform'. After examining the results, the number of neighbors that led to the best accuracy was also used with different weights ('uniform', 'distance') to examine the difference in performance between the weighting schemes at the same number of neighbors. The difference between the two weighting functions is that the first assigns the same weight to each neighbor, while the second weights neighbors closer to the query point more heavily. When varying the neighbors, the accuracy score is best for 100 neighbors. With a growing number of neighbors the predictions should become more stable up to a certain point, as the algorithm can 'choose' between more options, but pushing it too far (here neighbors = 1000, 2000) no longer improves the accuracy score. It is therefore important to adjust the number of neighbors to the problem at hand and not to increase complexity unnecessarily by setting the number of neighbors too high while performance stagnates. Also, an insufficient number of neighbors bears the danger of overfitting the model. The respective train accuracy should therefore also be calculated, to inspect the difference between performance on the test and train samples. For 100 neighbors, we could not find strong differences between the train and the test sample.
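
A sketch of the neighbor and weight variation; the grid of neighbor counts is illustrative, and the split variables are assumed from above.

```python
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# X_train, X_test, y_train, y_test as in the classification split above (assumed)
for n_neighbors in [5, 20, 50, 100, 500, 1000]:
    for weights in ["uniform", "distance"]:
        clf = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights)
        clf.fit(X_train, y_train)
        print(n_neighbors, weights, round(accuracy_score(y_test, clf.predict(X_test)), 3))
```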

Random Forest

Random forest is now applied in order to improve the results. Feature importances were also ascertained. The tuning parameters and how they affect the performance are explained. In general, random forest contains a large number of small decision trees, called estimators, with every tree making its own predictions. The random forest model combines the predictions of the estimators to produce a more accurate prediction.

The most important features for classifying the data are printed and also shown in the feature importances plot above. Health (at assignment) is the most important variable for predicting excellent health status 30 months after assignment, which is an obvious result. Other, less important variables listed above include: female, household size, age, mother's years of education at assignment, and average weekly gross earnings at assignment.

The tuning parameters of random forests are max_depth and n_estimators. The depth of the trees inside the ensemble determines how well the trees learn the data. If the depth is too high, this may lead to overfitting. The number of estimators is the amount of trees over which is averaged. Usually the more the better, because this leads to a more robust model, due to the variety of possible random splits. We kept the number of n_estimators constant while varying the number of max_depth, because changes in n_estimators did not further improve the accuracy score.
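
A sketch of the max_depth variation and the feature importances; n_estimators = 500 and the depth grid are illustrative choices, and the split variables are assumed from above.

```python
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# X_train, X_test, y_train, y_test as in the classification split above (assumed)
for max_depth in [3, 5, 10, None]:
    clf = RandomForestClassifier(n_estimators=500, max_depth=max_depth, random_state=0)
    clf.fit(X_train, y_train)
    print(max_depth, round(accuracy_score(y_test, clf.predict(X_test)), 3))

# feature importances of the last fitted forest, largest first
importances = pd.Series(clf.feature_importances_, index=X_train.columns).sort_values(ascending=False)
print(importances.head(10))
```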

Finally, the results are summarized and presented. The results table shows the different models and their test-set accuracies. Overall, there are no strong differences between the performances of the models. Within each model family, performance could be improved by adjusting the hyperparameters, but no outstanding differences were achieved. The best model appears to be the logistic regression, which achieved an accuracy of 0.648. When comparing the results within the logistic regression models, a small improvement could be achieved by changing the solver to 'newton-cg'.
We consider the performance of this model a sufficient estimator for the out-of-sample performance. It is better than chance (0.5). A complete accuracy of 1.00 is impossible, given the variety of factors that may influence health. Further, none of the models are overfitted, because they show similar results to one another and if the train accuracies were to be shown, they wouldn't be very much different. This is an important consideration when tuning, because one can for example tune a random forest to predict the train-set to 99%, but the test-set only by 60%.