# Multiple linear regression on the 50_Startups dataset: predict Profit from
# R&D Spend, Administration, Marketing Spend, and State.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Load the dataset from the working directory; data.head() previews the first
# five rows (notebook cell output follows below).
data = pd.read_csv('50_Startups.csv')
data.head()
R&D Spend | Administration | Marketing Spend | State | Profit | |
---|---|---|---|---|---|
0 | 165349.20 | 136897.80 | 471784.10 | New York | 192261.83 |
1 | 162597.70 | 151377.59 | 443898.53 | California | 191792.06 |
2 | 153441.51 | 101145.55 | 407934.54 | Florida | 191050.39 |
3 | 144372.41 | 118671.85 | 383199.62 | New York | 182901.99 |
4 | 142107.34 | 91391.77 | 366168.42 | Florida | 166187.94 |
# Feature matrix = every column except the last (Profit); target vector = Profit.
fm = data.iloc[:,:-1].values
v = data.iloc[:,-1].values
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the categorical State column (positional index 3).  The dummy
# columns are emitted FIRST, in alphabetical order (California, Florida,
# New York), followed by the passthrough numeric columns — the printed rows at
# the `print(fm[0:3,...])` cell below confirm this ordering.
ct = ColumnTransformer(transformers=[('encoder',OneHotEncoder(),[3])],remainder='passthrough')
fm = ct.fit_transform(fm)
test_size = 0.25
random_state = 10
from sklearn.model_selection import train_test_split
# 75/25 hold-out split with a fixed seed so runs are reproducible.
fm_train,fm_test,v_train,v_test = train_test_split(fm,v,test_size=test_size,random_state=random_state)
# Visual sanity check: Profit against each of the three spend features.
# Columns 3/4/5 of the encoded matrix are R&D, Administration, and Marketing
# (the first three columns are the state dummies).
for col, label in ((3, 'R&D Spend'), (4, 'Admin Spend'), (5, 'Marketing Spend')):
    plt.scatter(fm_train[:, col], v_train, color='red')
    plt.title('Profit vs ' + label)
    plt.xlabel(label)
    plt.ylabel('Profit')
    plt.show()

from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares model on all encoded features.
reg = LinearRegression()
reg.fit(fm_train, v_train)
LinearRegression()
# Predict profits for the held-out test rows with the all-features model.
v_pred = reg.predict(fm_test)
# Drop the first dummy column (California) to avoid the dummy-variable trap
# before the statsmodels analysis; remaining state columns: Florida, New York.
fm = fm[:,1:]
import statsmodels.api as sm
# statsmodels' OLS does not add an intercept automatically, so prepend an
# explicit column of ones.  Use the actual row count rather than the
# hard-coded 50 so the code works for any dataset size.
fm = np.append(arr = np.ones((fm.shape[0],1)).astype(int), values = fm, axis = 1)
print(fm[0:3,[0,1,2,3,4,5]])
[[1 0.0 1.0 165349.2 136897.8 471784.1] [1 0.0 0.0 162597.7 151377.59 443898.53] [1 1.0 0.0 153441.51 101145.55 407934.54]]
# Backward elimination, step 1: regress on every column (intercept, both
# state dummies, and the three spend features) and inspect the p-values.
fm_opt = fm[:, [0, 1, 2, 3, 4, 5]].astype(np.float64)
regressor_OLS = sm.OLS(endog=v, exog=fm_opt).fit()
regressor_OLS.summary()
Dep. Variable: | y | R-squared: | 0.951 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.945 |
Method: | Least Squares | F-statistic: | 169.9 |
Date: | Wed, 27 Apr 2022 | Prob (F-statistic): | 1.34e-27 |
Time: | 18:30:28 | Log-Likelihood: | -525.38 |
No. Observations: | 50 | AIC: | 1063. |
Df Residuals: | 44 | BIC: | 1074. |
Df Model: | 5 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | 5.013e+04 | 6884.820 | 7.281 | 0.000 | 3.62e+04 | 6.4e+04 |
x1 | 198.7888 | 3371.007 | 0.059 | 0.953 | -6595.030 | 6992.607 |
x2 | -41.8870 | 3256.039 | -0.013 | 0.990 | -6604.003 | 6520.229 |
x3 | 0.8060 | 0.046 | 17.369 | 0.000 | 0.712 | 0.900 |
x4 | -0.0270 | 0.052 | -0.517 | 0.608 | -0.132 | 0.078 |
x5 | 0.0270 | 0.017 | 1.574 | 0.123 | -0.008 | 0.062 |
Omnibus: | 14.782 | Durbin-Watson: | 1.283 |
---|---|---|---|
Prob(Omnibus): | 0.001 | Jarque-Bera (JB): | 21.266 |
Skew: | -0.948 | Prob(JB): | 2.41e-05 |
Kurtosis: | 5.572 | Cond. No. | 1.45e+06 |
# Preview the first rows after dropping column 2 (the New York dummy, which
# had the highest p-value, 0.990, in the previous summary).
print(fm[0:3,[0,1,3,4,5]])
[[1 0.0 165349.2 136897.8 471784.1] [1 0.0 162597.7 151377.59 443898.53] [1 1.0 153441.51 101145.55 407934.54]]
# Backward elimination, step 2: refit without the New York dummy (p = 0.990).
fm_opt = fm[:, [0, 1, 3, 4, 5]].astype(np.float64)
regressor_OLS = sm.OLS(endog=v, exog=fm_opt).fit()
regressor_OLS.summary()
Dep. Variable: | y | R-squared: | 0.951 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.946 |
Method: | Least Squares | F-statistic: | 217.2 |
Date: | Wed, 27 Apr 2022 | Prob (F-statistic): | 8.49e-29 |
Time: | 18:30:28 | Log-Likelihood: | -525.38 |
No. Observations: | 50 | AIC: | 1061. |
Df Residuals: | 45 | BIC: | 1070. |
Df Model: | 4 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | 5.011e+04 | 6647.870 | 7.537 | 0.000 | 3.67e+04 | 6.35e+04 |
x1 | 220.1585 | 2900.536 | 0.076 | 0.940 | -5621.821 | 6062.138 |
x2 | 0.8060 | 0.046 | 17.606 | 0.000 | 0.714 | 0.898 |
x3 | -0.0270 | 0.052 | -0.523 | 0.604 | -0.131 | 0.077 |
x4 | 0.0270 | 0.017 | 1.592 | 0.118 | -0.007 | 0.061 |
Omnibus: | 14.758 | Durbin-Watson: | 1.282 |
---|---|---|---|
Prob(Omnibus): | 0.001 | Jarque-Bera (JB): | 21.172 |
Skew: | -0.948 | Prob(JB): | 2.53e-05 |
Kurtosis: | 5.563 | Cond. No. | 1.40e+06 |
# Statewise profit distributions.  Columns of fm at this point are
# [intercept, Florida, New York, R&D, Admin, Marketing]: the California dummy
# was dropped earlier (dummy-variable trap), so California rows are the ones
# where BOTH remaining state dummies are zero.
# BUG FIX: the original tested fm[i,0] == 1.0 for California, but column 0 is
# the all-ones intercept column, so every one of the 50 rows matched
# (the original output was (17, 50, 16) — impossible for 50 startups).
ny_profits = [v[i] for i in range(len(v)) if fm[i,2] == 1.0]
cali_profits = [v[i] for i in range(len(v)) if fm[i,1] == 0.0 and fm[i,2] == 0.0]
florida_profits = [v[i] for i in range(len(v)) if fm[i,1] == 1.0]
plt.boxplot([ny_profits,cali_profits,florida_profits])
plt.title('Statewise Profit Distribution')
plt.xlabel('New York-California-Florida')
plt.ylabel('Profit Distribution')
plt.show()
# Counts should sum to 50 (17 NY + 17 CA + 16 FL).
len(ny_profits),len(cali_profits),len(florida_profits)
(17, 50, 16)
# Preview after also dropping column 1 (the Florida dummy, p = 0.940).
print(fm[0:3,[0,3,4,5]])
[[1 165349.2 136897.8 471784.1] [1 162597.7 151377.59 443898.53] [1 153441.51 101145.55 407934.54]]
# Backward elimination, step 3: both state dummies removed; regress Profit on
# the intercept plus the three spend features.
fm_opt = fm[:, [0, 3, 4, 5]].astype(np.float64)
regressor_OLS = sm.OLS(endog=v, exog=fm_opt).fit()
regressor_OLS.summary()
Dep. Variable: | y | R-squared: | 0.951 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.948 |
Method: | Least Squares | F-statistic: | 296.0 |
Date: | Wed, 27 Apr 2022 | Prob (F-statistic): | 4.53e-30 |
Time: | 18:30:29 | Log-Likelihood: | -525.39 |
No. Observations: | 50 | AIC: | 1059. |
Df Residuals: | 46 | BIC: | 1066. |
Df Model: | 3 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | 5.012e+04 | 6572.353 | 7.626 | 0.000 | 3.69e+04 | 6.34e+04 |
x1 | 0.8057 | 0.045 | 17.846 | 0.000 | 0.715 | 0.897 |
x2 | -0.0268 | 0.051 | -0.526 | 0.602 | -0.130 | 0.076 |
x3 | 0.0272 | 0.016 | 1.655 | 0.105 | -0.006 | 0.060 |
Omnibus: | 14.838 | Durbin-Watson: | 1.282 |
---|---|---|---|
Prob(Omnibus): | 0.001 | Jarque-Bera (JB): | 21.442 |
Skew: | -0.949 | Prob(JB): | 2.21e-05 |
Kurtosis: | 5.586 | Cond. No. | 1.40e+06 |
# Preview after dropping column 4 (Administration spend, p = 0.602).
print(fm[0:3,[0,3,5]])
[[1 165349.2 471784.1] [1 162597.7 443898.53] [1 153441.51 407934.54]]
# Backward elimination, step 4: intercept + R&D + Marketing only.
fm_opt = fm[:, [0, 3, 5]].astype(np.float64)
regressor_OLS = sm.OLS(endog=v, exog=fm_opt).fit()
regressor_OLS.summary()
Dep. Variable: | y | R-squared: | 0.950 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.948 |
Method: | Least Squares | F-statistic: | 450.8 |
Date: | Wed, 27 Apr 2022 | Prob (F-statistic): | 2.16e-31 |
Time: | 18:30:29 | Log-Likelihood: | -525.54 |
No. Observations: | 50 | AIC: | 1057. |
Df Residuals: | 47 | BIC: | 1063. |
Df Model: | 2 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | 4.698e+04 | 2689.933 | 17.464 | 0.000 | 4.16e+04 | 5.24e+04 |
x1 | 0.7966 | 0.041 | 19.266 | 0.000 | 0.713 | 0.880 |
x2 | 0.0299 | 0.016 | 1.927 | 0.060 | -0.001 | 0.061 |
Omnibus: | 14.677 | Durbin-Watson: | 1.257 |
---|---|---|---|
Prob(Omnibus): | 0.001 | Jarque-Bera (JB): | 21.161 |
Skew: | -0.939 | Prob(JB): | 2.54e-05 |
Kurtosis: | 5.575 | Cond. No. | 5.32e+05 |
# Preview after dropping column 5 (Marketing spend, p = 0.060).
print(fm[0:3,[0,3]])
[[1 165349.2] [1 162597.7] [1 153441.51]]
# Backward elimination, final step: intercept + R&D Spend alone.
fm_opt = fm[:, [0, 3]].astype(np.float64)
regressor_OLS = sm.OLS(endog=v, exog=fm_opt).fit()
regressor_OLS.summary()
Dep. Variable: | y | R-squared: | 0.947 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.945 |
Method: | Least Squares | F-statistic: | 849.8 |
Date: | Wed, 27 Apr 2022 | Prob (F-statistic): | 3.50e-32 |
Time: | 18:30:29 | Log-Likelihood: | -527.44 |
No. Observations: | 50 | AIC: | 1059. |
Df Residuals: | 48 | BIC: | 1063. |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | 4.903e+04 | 2537.897 | 19.320 | 0.000 | 4.39e+04 | 5.41e+04 |
x1 | 0.8543 | 0.029 | 29.151 | 0.000 | 0.795 | 0.913 |
Omnibus: | 13.727 | Durbin-Watson: | 1.116 |
---|---|---|---|
Prob(Omnibus): | 0.001 | Jarque-Bera (JB): | 18.536 |
Skew: | -0.911 | Prob(JB): | 9.44e-05 |
Kurtosis: | 5.361 | Cond. No. | 1.65e+05 |
# Final feature set from backward elimination: intercept + R&D Spend.
print(fm[0:3,[0,3]])
[[1 165349.2] [1 162597.7] [1 153441.51]]
# Re-split using only the R&D Spend column (index 3) and fit a fresh
# single-feature linear model; reshape(-1, 1) turns the 1-D column into the
# 2-D shape scikit-learn expects.
fm_b_elim_train, fm_b_elim_test, v_b_elim_train, v_b_elim_test = train_test_split(
    fm[:, 3].reshape(-1, 1), v, test_size=test_size, random_state=random_state)
b_elim_reg = LinearRegression()
b_elim_reg.fit(fm_b_elim_train, v_b_elim_train)
LinearRegression()
# Predict test-set profits with the R&D-only (backward-elimination) model.
v_b_elim_pred = b_elim_reg.predict(fm_b_elim_test)
# --- Forward feature selection on a freshly re-encoded full matrix ---
# Rebuild fm (all six columns: 3 state dummies + 3 spend features).
fm = data.iloc[:,:-1].values
ct = ColumnTransformer(transformers=[('encoder',OneHotEncoder(),[3])],remainder='passthrough')
fm = ct.fit_transform(fm)
from sklearn.linear_model import LinearRegression
# NOTE(review): mlxtend is an extra third-party dependency beyond sklearn.
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
regressor = LinearRegression()
# Greedily add features (forward=True) until k_features=2 remain, scored by
# cross-validated negative mean squared error; verbose=2 prints the log below.
sfs1 = sfs(regressor, k_features=2, forward=True, verbose=2, scoring= 'neg_mean_squared_error')
sfs1 = sfs1.fit(fm,v)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.0s remaining: 0.0s [Parallel(n_jobs=1)]: Done 6 out of 6 | elapsed: 0.0s finished [2022-04-27 18:30:29] Features: 1/2 -- score: -99665424.82352272[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.0s remaining: 0.0s [Parallel(n_jobs=1)]: Done 5 out of 5 | elapsed: 0.0s finished [2022-04-27 18:30:29] Features: 2/2 -- score: -94580206.64311817
# Re-display the raw data to cross-check which columns were selected.
data.head()
R&D Spend | Administration | Marketing Spend | State | Profit | |
---|---|---|---|---|---|
0 | 165349.20 | 136897.80 | 471784.10 | New York | 192261.83 |
1 | 162597.70 | 151377.59 | 443898.53 | California | 191792.06 |
2 | 153441.51 | 101145.55 | 407934.54 | Florida | 191050.39 |
3 | 144372.41 | 118671.85 | 383199.62 | New York | 182901.99 |
4 | 142107.34 | 91391.77 | 366168.42 | Florida | 166187.94 |
# k_feature_names_ holds the selected column indices as strings (fm is a
# plain ndarray); cast them to int for positional indexing.
feat_vals = [int(name) for name in sfs1.k_feature_names_]
print(fm[0:3, feat_vals])
[[165349.2 471784.1] [162597.7 443898.53] [153441.51 407934.54]]
# Split on the two forward-selected features (R&D and Marketing, per the
# preview above) and fit a fresh linear model.
fm_f_sel_train,fm_f_sel_test,v_f_sel_train,v_f_sel_test = train_test_split(fm[:,feat_vals],v,test_size=test_size,random_state=random_state)
from sklearn.linear_model import LinearRegression
f_sel_reg = LinearRegression()
f_sel_reg.fit(fm_f_sel_train,v_f_sel_train)
LinearRegression()
# Predict test-set profits with the forward-selection model.
v_f_sel_pred = f_sel_reg.predict(fm_f_sel_test)
# Actual (blue) vs predicted (red) profit per test company, all-features model.
# NOTE(review): with test_size=0.25 the split holds 13 rows, not the 10 the
# title claims — confirm the intended title.
idx = range(1, len(v_test) + 1)
plt.scatter(idx, v_test, color='blue')
plt.scatter(idx, v_pred, color='red')
plt.title('Actual/Predicted Profit for 10 Startups')
plt.xlabel('Company Index')
plt.ylabel('Actual/Predicted Profit')
plt.show()
# Actual (blue) vs predicted (red) profit, R&D-only model.
positions = range(1, len(v_b_elim_test) + 1)
plt.scatter(positions, v_b_elim_test, color='blue')
plt.scatter(positions, v_b_elim_pred, color='red')
plt.title('Actual/Predicted Profit for 10 Startups')
plt.xlabel('Company Index')
plt.ylabel('Actual/Predicted Profit')
plt.show()
# Actual (blue) vs predicted (red) profit, forward-selection model.
xs = range(1, len(v_f_sel_test) + 1)
plt.scatter(xs, v_f_sel_test, color='blue')
plt.scatter(xs, v_f_sel_pred, color='red')
plt.title('Actual/Predicted Profit for 10 Startups')
plt.xlabel('Company Index')
plt.ylabel('Actual/Predicted Profit')
plt.show()
# Print actual vs predicted side by side (rounded display only).
np.set_printoptions(precision=0)
print(np.column_stack((v_test, v_pred)))
[[ 89949. 88857.] [108734. 109300.] [ 65200. 66680.] [ 71498. 71093.] [ 42560. 48589.] [118474. 116162.] [182902. 171322.] [ 99938. 99971.] [155753. 159258.] [156123. 158377.] [ 81006. 83684.] [191050. 179967.] [ 78240. 75512.]]
# Actual vs predicted for the R&D-only model.
np.set_printoptions(precision=0)
print(np.column_stack((v_b_elim_test, v_b_elim_pred)))
[[ 89949. 86814.] [108734. 106482.] [ 65200. 68462.] [ 71498. 69688.] [ 42560. 49870.] [118474. 113794.] [182902. 170897.] [ 99938. 101840.] [155753. 159099.] [156123. 162718.] [ 81006. 82194.] [191050. 178500.] [ 78240. 73975.]]
# Actual vs predicted for the forward-selection model.
np.set_printoptions(precision=0)
print(np.column_stack((v_f_sel_test, v_f_sel_pred)))
[[ 89949. 87854.] [108734. 108995.] [ 65200. 66502.] [ 71498. 70515.] [ 42560. 48272.] [118474. 115752.] [182902. 171685.] [ 99938. 99515.] [155753. 159130.] [156123. 157868.] [ 81006. 82966.] [191050. 179453.] [ 78240. 75154.]]
from sklearn.metrics import r2_score
# R^2 on the test set for the all-features model.
print(f'{r2_score(v_test, v_pred):1.6f}')
0.987405
# R^2 for the R&D-only model.
print(f'{r2_score(v_b_elim_test, v_b_elim_pred):1.6f}')
0.981786
# R^2 for the forward-selection model.
print(f'{r2_score(v_f_sel_test, v_f_sel_pred):1.6f}')
0.987368
# R^2 scores collected over ten (test_size, random_state) runs for the three
# models: scikit-learn on all features, R&D-only (backward elimination), and
# R&D + Marketing (forward selection).
records = [
    [0.2, 1, 0.964962, 0.96104, 0.964644],
    [0.2, 2, 0.978326, 0.978148, 0.981418],
    [0.2, 3, 0.944206, 0.95655, 0.961288],
    [0.2, 4, 0.956036, 0.961321, 0.967485],
    [0.2, 5, 0.966976, 0.972811, 0.96836],
    [0.25, 6, 0.917621, 0.914028, 0.921129],
    [0.25, 7, 0.925066, 0.936618, 0.939721],
    [0.25, 8, 0.899269, 0.902233, 0.909121],
    [0.25, 9, 0.888097, 0.896608, 0.901835],
    [0.25, 10, 0.987405, 0.981786, 0.987368],
]
r2_score_metrics = pd.DataFrame(
    records,
    columns=['TEST SIZE', 'RANDOM_STATE', 'SCIKIT ACCURACY', 'R&D ONLY', 'R&D AND MARKETING'],
)
print('\n', r2_score_metrics, '\n')
print('\nAverage: 0.9427964,0.9461143,0.9502369')
TEST SIZE RANDOM_STATE SCIKIT ACCURACY R&D ONLY R&D AND MARKETING 0 0.20 1 0.964962 0.961040 0.964644 1 0.20 2 0.978326 0.978148 0.981418 2 0.20 3 0.944206 0.956550 0.961288 3 0.20 4 0.956036 0.961321 0.967485 4 0.20 5 0.966976 0.972811 0.968360 5 0.25 6 0.917621 0.914028 0.921129 6 0.25 7 0.925066 0.936618 0.939721 7 0.25 8 0.899269 0.902233 0.909121 8 0.25 9 0.888097 0.896608 0.901835 9 0.25 10 0.987405 0.981786 0.987368 Average: 0.9427964,0.9461143,0.9502369