import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# Load the startups dataset from a path relative to the working directory.
data = pd.read_csv('50_Startups.csv')

# Notebook-style preview of the first rows (only displayed interactively).
data.head()

# Every column except the last one goes into the feature matrix;
# the last column is the dependent variable.
feature_frame, target_series = data.iloc[:, :-1], data.iloc[:, -1]
fm = feature_frame.values
v = target_series.values


from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# One-hot encode column 3 of the feature matrix and pass every other
# column through unchanged.
state_step = ('encoder', OneHotEncoder(), [3])
ct = ColumnTransformer(transformers=[state_step], remainder='passthrough')
fm = ct.fit_transform(fm)


# Split settings reused by every train/test split in this script.
test_size, random_state = 0.25, 10


from sklearn.model_selection import train_test_split
# train_test_split returns the pieces in the order
# (features_train, features_test, target_train, target_test).
split_parts = train_test_split(fm, v, test_size=test_size, random_state=random_state)
fm_train, fm_test, v_train, v_test = split_parts


# Scatter three columns of the training split against the training target.
# Each entry is (column index in fm_train, plot title, x-axis label).
spend_plots = [
    (3, 'Profit vs R&D Spend', 'R&D Spend'),
    (4, 'Profit vs Admin Spend', 'Admin Spend'),
    (5, 'Profit vs Marketing Spend', 'Marketing Spend'),
]
for col_idx, plot_title, x_label in spend_plots:
    plt.scatter(fm_train[:, col_idx], v_train, color='red')
    plt.title(plot_title)
    plt.xlabel(x_label)
    plt.ylabel('Profit')
    plt.show()


from sklearn.linear_model import LinearRegression
# Fit a linear regression on all encoded training features.
# fit() returns the estimator itself, so construction and fitting are chained.
reg = LinearRegression().fit(fm_train, v_train)

LinearRegression()


# Predictions of the all-features model on the held-out rows.
v_pred = reg.predict(fm_test)


# Drop the first one-hot column so the remaining dummies are not collinear
# with the intercept column added below (dummy variable trap).
fm = fm[:, 1:]


import statsmodels.api as sm
# Prepend a column of ones so that sm.OLS estimates a constant term.
# The column height is taken from fm itself instead of being hard-coded
# to 50, so the script keeps working if the dataset changes size.
fm = np.append(arr=np.ones((fm.shape[0], 1)).astype(int), values=fm, axis=1)


# Preview the first three rows of columns 0 through 5.
print(fm[:3][:, list(range(6))])

[[1 0.0 1.0 165349.2 136897.8 471784.1]
 [1 0.0 0.0 162597.7 151377.59 443898.53]
 [1 1.0 0.0 153441.51 101145.55 407934.54]]


# Backward elimination, round 1: regress the target on columns 0-5.
kept_cols = [0, 1, 2, 3, 4, 5]
# Cast to float64 before handing the matrix to statsmodels
# (fm appears to be an object-dtype array, judging by its mixed int/float preview).
fm_opt = fm[:, kept_cols].astype(np.float64)
ols_model = sm.OLS(endog=v, exog=fm_opt)
regressor_OLS = ols_model.fit()
regressor_OLS.summary()


# Preview the first three rows with column 2 left out.
print(fm[:3][:, [0, 1, 3, 4, 5]])

[[1 0.0 165349.2 136897.8 471784.1]
 [1 0.0 162597.7 151377.59 443898.53]
 [1 1.0 153441.51 101145.55 407934.54]]


# Backward elimination, round 2: column 2 is no longer passed to the model.
kept_cols = [0, 1, 3, 4, 5]
fm_opt = fm[:, kept_cols].astype(np.float64)
ols_model = sm.OLS(endog=v, exog=fm_opt)
regressor_OLS = ols_model.fit()
regressor_OLS.summary()


# Group the profits by state using the one-hot columns of fm.
# At this point column 0 is the all-ones intercept, column 1 is the Florida
# dummy and column 2 is the New York dummy (see the matrix preview: the
# New York row reads "1 0.0 1.0", the Florida row "1 1.0 0.0").
# California is the dropped baseline category, i.e. the rows where both
# dummies are 0. Testing column 0 for California would match every row,
# because that column is the intercept.
ny_profits = [profit for profit, row in zip(v, fm) if row[2] == 1.0]
cali_profits = [profit for profit, row in zip(v, fm)
                if row[1] == 0.0 and row[2] == 0.0]
florida_profits = [profit for profit, row in zip(v, fm) if row[1] == 1.0]


# One box per state, in the order named on the x-axis label.
profits_by_state = (ny_profits, cali_profits, florida_profits)
plt.boxplot(list(profits_by_state))
plt.title('Statewise Profit Distribution')
plt.xlabel('New York-California-Florida')
plt.ylabel('Profit Distribution')
plt.show()


# Size of each group (only displayed when run interactively).
tuple(len(group) for group in profits_by_state)

(17, 50, 16)


# Preview the first three rows with columns 1 and 2 left out.
print(fm[:3][:, [0, 3, 4, 5]])

[[1 165349.2 136897.8 471784.1]
 [1 162597.7 151377.59 443898.53]
 [1 153441.51 101145.55 407934.54]]


# Backward elimination, round 3: columns 1 and 2 are no longer passed.
kept_cols = [0, 3, 4, 5]
fm_opt = fm[:, kept_cols].astype(np.float64)
ols_model = sm.OLS(endog=v, exog=fm_opt)
regressor_OLS = ols_model.fit()
regressor_OLS.summary()


# Preview the first three rows of columns 0, 3 and 5.
print(fm[:3][:, [0, 3, 5]])

[[1 165349.2 471784.1]
 [1 162597.7 443898.53]
 [1 153441.51 407934.54]]


# Backward elimination, round 4: only columns 0, 3 and 5 are passed.
kept_cols = [0, 3, 5]
fm_opt = fm[:, kept_cols].astype(np.float64)
ols_model = sm.OLS(endog=v, exog=fm_opt)
regressor_OLS = ols_model.fit()
regressor_OLS.summary()


# Preview the first three rows of columns 0 and 3.
print(fm[:3][:, [0, 3]])

[[1 165349.2]
 [1 162597.7]
 [1 153441.51]]


# Backward elimination, round 5: only columns 0 and 3 are passed.
kept_cols = [0, 3]
fm_opt = fm[:, kept_cols].astype(np.float64)
ols_model = sm.OLS(endog=v, exog=fm_opt)
regressor_OLS = ols_model.fit()
regressor_OLS.summary()


# Preview the first three rows of the final column set (columns 0 and 3).
print(fm[:3][:, [0, 3]])

[[1 165349.2]
 [1 162597.7]
 [1 153441.51]]


# Keep only column 3 of fm, shaped as an (n_rows, 1) matrix, and split it
# with the same test_size / random_state as the other models.
single_feature = fm[:, 3].reshape(-1, 1)
(fm_b_elim_train, fm_b_elim_test,
 v_b_elim_train, v_b_elim_test) = train_test_split(
    single_feature, v, test_size=test_size, random_state=random_state)


# fit() returns the estimator itself, so construction and fitting are chained.
b_elim_reg = LinearRegression().fit(fm_b_elim_train, v_b_elim_train)

LinearRegression()


# Test-set predictions of the single-feature model.
v_b_elim_pred = b_elim_reg.predict(fm_b_elim_test)


# Rebuild the feature matrix from the raw frame: every column but the last,
# with column 3 one-hot encoded and the other columns passed through.
raw_features = data.iloc[:, :-1].values
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [3])],
    remainder='passthrough',
)
fm = ct.fit_transform(raw_features)


from sklearn.linear_model import LinearRegression
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
# Forward sequential selection of exactly two features, scored by
# negative mean squared error, with a linear regression as the estimator.
regressor = LinearRegression()
selector_options = dict(
    k_features=2,
    forward=True,
    verbose=2,
    scoring='neg_mean_squared_error',
)
sfs1 = sfs(regressor, **selector_options)
sfs1 = sfs1.fit(fm, v)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.0s finished

[2022-04-27 18:30:29] Features: 1/2 -- score: -99665424.82352272[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished

[2022-04-27 18:30:29] Features: 2/2 -- score: -94580206.64311817


# Preview of the raw frame (only displayed in an interactive session).
data.head()


# Column indices chosen by forward selection. k_feature_idx_ holds the
# integer positions directly, so they do not have to be parsed back out of
# the string feature names (which only works while the names are digits,
# i.e. while the selector was fitted on an unnamed array).
feat_vals = list(sfs1.k_feature_idx_)
print(fm[:, feat_vals][:3, :])

[[165349.2 471784.1]
 [162597.7 443898.53]
 [153441.51 407934.54]]


# Split only the forward-selected columns, using the same test_size /
# random_state as the other models.
selected_features = fm[:, feat_vals]
(fm_f_sel_train, fm_f_sel_test,
 v_f_sel_train, v_f_sel_test) = train_test_split(
    selected_features, v, test_size=test_size, random_state=random_state)


from sklearn.linear_model import LinearRegression
# fit() returns the estimator itself, so construction and fitting are chained.
f_sel_reg = LinearRegression().fit(fm_f_sel_train, v_f_sel_train)

LinearRegression()


# Test-set predictions of the model fitted on the forward-selected columns.
v_f_sel_pred = f_sel_reg.predict(fm_f_sel_test)


# Actual (blue) versus predicted (red) values on the test split, one figure
# per model: all features, single-feature model, forward-selected features.
# The title reports the real number of test rows instead of a hard-coded
# "10"; with test_size = 0.25 the split printed further down has 13 rows.
for actual, predicted in (
    (v_test, v_pred),
    (v_b_elim_test, v_b_elim_pred),
    (v_f_sel_test, v_f_sel_pred),
):
    index = range(1, len(actual) + 1)
    plt.scatter(index, actual, color='blue')
    plt.scatter(index, predicted, color='red')
    plt.title('Actual/Predicted Profit for {count} Startups'.format(count=len(actual)))
    plt.xlabel('Company Index')
    plt.ylabel('Actual/Predicted Profit')
    plt.show()


# Print actual and predicted values of the all-features model side by side,
# displayed without decimals.
np.set_printoptions(precision=0)
full_model_pairs = np.column_stack((v_test, v_pred))
print(full_model_pairs)

[[ 89949.  88857.]
 [108734. 109300.]
 [ 65200.  66680.]
 [ 71498.  71093.]
 [ 42560.  48589.]
 [118474. 116162.]
 [182902. 171322.]
 [ 99938.  99971.]
 [155753. 159258.]
 [156123. 158377.]
 [ 81006.  83684.]
 [191050. 179967.]
 [ 78240.  75512.]]


# Print actual and predicted values of the single-feature model side by side,
# displayed without decimals.
np.set_printoptions(precision=0)
b_elim_pairs = np.column_stack((v_b_elim_test, v_b_elim_pred))
print(b_elim_pairs)

[[ 89949.  86814.]
 [108734. 106482.]
 [ 65200.  68462.]
 [ 71498.  69688.]
 [ 42560.  49870.]
 [118474. 113794.]
 [182902. 170897.]
 [ 99938. 101840.]
 [155753. 159099.]
 [156123. 162718.]
 [ 81006.  82194.]
 [191050. 178500.]
 [ 78240.  73975.]]


# Print actual and predicted values of the forward-selection model side by
# side, displayed without decimals.
np.set_printoptions(precision=0)
f_sel_pairs = np.column_stack((v_f_sel_test, v_f_sel_pred))
print(f_sel_pairs)

[[ 89949.  87854.]
 [108734. 108995.]
 [ 65200.  66502.]
 [ 71498.  70515.]
 [ 42560.  48272.]
 [118474. 115752.]
 [182902. 171685.]
 [ 99938.  99515.]
 [155753. 159130.]
 [156123. 157868.]
 [ 81006.  82966.]
 [191050. 179453.]
 [ 78240.  75154.]]


from sklearn.metrics import r2_score
# R^2 of the all-features model on the test split, to six decimals.
full_model_r2 = r2_score(v_test, v_pred)
print(format(full_model_r2, '1.6f'))

0.987405


# R^2 of the single-feature model on the test split, to six decimals.
b_elim_r2 = r2_score(v_b_elim_test, v_b_elim_pred)
print(format(b_elim_r2, '1.6f'))

0.981786


# R^2 of the forward-selection model on the test split, to six decimals.
f_sel_r2 = r2_score(v_f_sel_test, v_f_sel_pred)
print(format(f_sel_r2, '1.6f'))

0.987368


# R^2 scores per (test size, random state) pair, one score column per model.
# These look hand-recorded from separate runs of this script: the last row
# matches the three scores printed above for test_size 0.25, random_state 10.
obs = [
    [0.2, 1, 0.964962, 0.96104, 0.964644],
    [0.2, 2, 0.978326, 0.978148, 0.981418],
    [0.2, 3, 0.944206, 0.95655, 0.961288],
    [0.2, 4, 0.956036, 0.961321, 0.967485],
    [0.2, 5, 0.966976, 0.972811, 0.96836],
    [0.25, 6, 0.917621, 0.914028, 0.921129],
    [0.25, 7, 0.925066, 0.936618, 0.939721],
    [0.25, 8, 0.899269, 0.902233, 0.909121],
    [0.25, 9, 0.888097, 0.896608, 0.901835],
    [0.25, 10, 0.987405, 0.981786, 0.987368],
]


score_columns = ['SCIKIT ACCURACY', 'R&D ONLY', 'R&D AND MARKETING']
r2_score_metrics = pd.DataFrame(data=obs, columns=['TEST SIZE', 'RANDOM_STATE'] + score_columns)
print('\n', r2_score_metrics, '\n')
# Derive the per-model averages from the table instead of hard-coding them,
# so the summary line cannot drift out of sync when rows are added or edited.
average_line = '\nAverage: ' + ','.join(
    '{:.7f}'.format(column_mean)
    for column_mean in r2_score_metrics[score_columns].mean()
)
print(average_line)

    TEST SIZE  RANDOM_STATE  SCIKIT ACCURACY  R&D ONLY  R&D AND MARKETING
0       0.20             1         0.964962  0.961040           0.964644
1       0.20             2         0.978326  0.978148           0.981418
2       0.20             3         0.944206  0.956550           0.961288
3       0.20             4         0.956036  0.961321           0.967485
4       0.20             5         0.966976  0.972811           0.968360
5       0.25             6         0.917621  0.914028           0.921129
6       0.25             7         0.925066  0.936618           0.939721
7       0.25             8         0.899269  0.902233           0.909121
8       0.25             9         0.888097  0.896608           0.901835
9       0.25            10         0.987405  0.981786           0.987368 


Average: 0.9427964,0.9461143,0.9502369

	R&D Spend	Administration	Marketing Spend	State	Profit
0	165349.20	136897.80	471784.10	New York	192261.83
1	162597.70	151377.59	443898.53	California	191792.06
2	153441.51	101145.55	407934.54	Florida	191050.39
3	144372.41	118671.85	383199.62	New York	182901.99
4	142107.34	91391.77	366168.42	Florida	166187.94

Dep. Variable:	y	R-squared:	0.951
Model:	OLS	Adj. R-squared:	0.945
Method:	Least Squares	F-statistic:	169.9
Date:	Wed, 27 Apr 2022	Prob (F-statistic):	1.34e-27
Time:	18:30:28	Log-Likelihood:	-525.38
No. Observations:	50	AIC:	1063.
Df Residuals:	44	BIC:	1074.
Df Model:	5
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[0.025	0.975]
const	5.013e+04	6884.820	7.281	0.000	3.62e+04	6.4e+04
x1	198.7888	3371.007	0.059	0.953	-6595.030	6992.607
x2	-41.8870	3256.039	-0.013	0.990	-6604.003	6520.229
x3	0.8060	0.046	17.369	0.000	0.712	0.900
x4	-0.0270	0.052	-0.517	0.608	-0.132	0.078
x5	0.0270	0.017	1.574	0.123	-0.008	0.062

Omnibus:	14.782	Durbin-Watson:	1.283
Prob(Omnibus):	0.001	Jarque-Bera (JB):	21.266
Skew:	-0.948	Prob(JB):	2.41e-05
Kurtosis:	5.572	Cond. No.	1.45e+06

Dep. Variable:	y	R-squared:	0.951
Model:	OLS	Adj. R-squared:	0.946
Method:	Least Squares	F-statistic:	217.2
Date:	Wed, 27 Apr 2022	Prob (F-statistic):	8.49e-29
Time:	18:30:28	Log-Likelihood:	-525.38
No. Observations:	50	AIC:	1061.
Df Residuals:	45	BIC:	1070.
Df Model:	4
Covariance Type:	nonrobust

Omnibus:	14.758	Durbin-Watson:	1.282
Prob(Omnibus):	0.001	Jarque-Bera (JB):	21.172
Skew:	-0.948	Prob(JB):	2.53e-05
Kurtosis:	5.563	Cond. No.	1.40e+06

Omnibus:	14.838	Durbin-Watson:	1.282
Prob(Omnibus):	0.001	Jarque-Bera (JB):	21.442
Skew:	-0.949	Prob(JB):	2.21e-05
Kurtosis:	5.586	Cond. No.	1.40e+06

Dep. Variable:	y	R-squared:	0.950
Model:	OLS	Adj. R-squared:	0.948
Method:	Least Squares	F-statistic:	450.8
Date:	Wed, 27 Apr 2022	Prob (F-statistic):	2.16e-31
Time:	18:30:29	Log-Likelihood:	-525.54
No. Observations:	50	AIC:	1057.
Df Residuals:	47	BIC:	1063.
Df Model:	2
Covariance Type:	nonrobust

Omnibus:	14.677	Durbin-Watson:	1.257
Prob(Omnibus):	0.001	Jarque-Bera (JB):	21.161
Skew:	-0.939	Prob(JB):	2.54e-05
Kurtosis:	5.575	Cond. No.	5.32e+05

Dep. Variable:	y	R-squared:	0.947
Model:	OLS	Adj. R-squared:	0.945
Method:	Least Squares	F-statistic:	849.8
Date:	Wed, 27 Apr 2022	Prob (F-statistic):	3.50e-32
Time:	18:30:29	Log-Likelihood:	-527.44
No. Observations:	50	AIC:	1059.
Df Residuals:	48	BIC:	1063.
Df Model:	1
Covariance Type:	nonrobust

Omnibus:	13.727	Durbin-Watson:	1.116
Prob(Omnibus):	0.001	Jarque-Bera (JB):	18.536
Skew:	-0.911	Prob(JB):	9.44e-05
Kurtosis:	5.361	Cond. No.	1.65e+05

	coef	std err	t	P>\|t\|	[0.025	0.975]
const	5.011e+04	6647.870	7.537	0.000	3.67e+04	6.35e+04
x1	220.1585	2900.536	0.076	0.940	-5621.821	6062.138
x2	0.8060	0.046	17.606	0.000	0.714	0.898
x3	-0.0270	0.052	-0.523	0.604	-0.131	0.077
x4	0.0270	0.017	1.592	0.118	-0.007	0.061

	coef	std err	t	P>\|t\|	[0.025	0.975]
const	5.012e+04	6572.353	7.626	0.000	3.69e+04	6.34e+04
x1	0.8057	0.045	17.846	0.000	0.715	0.897
x2	-0.0268	0.051	-0.526	0.602	-0.130	0.076
x3	0.0272	0.016	1.655	0.105	-0.006	0.060

	coef	std err	t	P>\|t\|	[0.025	0.975]
const	4.698e+04	2689.933	17.464	0.000	4.16e+04	5.24e+04
x1	0.7966	0.041	19.266	0.000	0.713	0.880
x2	0.0299	0.016	1.927	0.060	-0.001	0.061

	coef	std err	t	P>\|t\|	[0.025	0.975]
const	4.903e+04	2537.897	19.320	0.000	4.39e+04	5.41e+04
x1	0.8543	0.029	29.151	0.000	0.795	0.913

Feature Selection Analysis using Stepwise Regression models¶

Data Preparation¶

Importing Libraries¶

read CSV File¶

preview of the contents of the dataframe¶

fm (feature matrix) -- collection of independent variables -- R&D, Admin, Marketing Spend and State¶

v (variable) -- dependent variable -- Profit¶

encode the 'State' attribute into numbers¶

split of feature matrix into training and test set¶

//based on predefined test_size and random_state for all models//¶

training set -- for the machine to learn from¶

test set -- to test machine's profit prediction¶

General Trends¶

profit versus expenditure on R&D (sampled from training set)¶

profit versus expenditure on administration (sampled from training set)¶

profit versus expenditure on marketing (sampled from training set)¶

Analysis 1:¶

>A Strong linear relationship exists between a Startup's Profits and their R&D Spend.¶

> Profit dependence on Admin Spend appears to be random.¶

>Profits portray a positive trend with increasing Marketing Spend.¶

Data Processing¶

Step 1: allow scikit-learn to extract statistically significant features and predict profits¶

invocation of the linear regressor and fitting to the training set¶

machine's prediction assigned to 'v_pred'¶

Step 2: manually implement Backward Selection¶

remove first column of dataframe (avoid dummy variable trap)¶

append a column vector of 1's at the beginning of the matrix for interpretation of the constant term in Multiple LR¶

Ordinary Least Square Regression Results display p-values stagewise¶

Initial Matrix preview¶

Initially, all the features are passed¶

feature x2 (dummy variable) is removed, as it has highest relative p-value, and model is re-fit¶

Matrix Preview after feature x2 is removed¶

feature x1 (dummy variable) is removed, as it has highest relative p-value, and model is re-fit¶

Analysis 2:¶

>Both dummy variables passed turn out to be statistically insignificant in Profit prediction.¶

>This indicates that the Statewise distribution of Profits is likely to be similar¶

>A Boxplot visual on the same illustrates this¶

Backward Elimination continued¶

Matrix Preview after feature x1 is removed¶

feature x2 (Admin Spend) is removed, as it has highest relative p-value, and model is re-fit¶

Matrix Preview after feature x2 is removed¶

feature x2 (Marketing Spend) is removed, as it has highest relative p-value, and model is re-fit¶

Matrix Preview after feature x2 is removed¶

Backward Elimination is concluded, as all p-values of remaining features are below 0.05¶

Final Matrix of Features (preview) contain a constant term and R&D Spend¶

Analysis 3:¶

> Since only the R&D Spend feature remains, (intercept term held aside), the 'General Trend Graphs' intuition appears negotiable.¶

> This leads to the fact that Profit varying only as a function of R&D Spend (Simple Linear Regression) is a good approximation to the model (later confirmed by r2_scores).¶

Implement model using backward elimination optimised feature matrix¶

Step 3: Implement model using Forward Selection¶

reassign feature matrix to original¶

Identify two most statistically significant features using Forward Selection and verify results from Backward Elimination¶

Analysis 4:¶

feat_vals -- values of those features identified as statistically significant by forward selection¶

>By checking, it is confirmed that the forward selection algorithm (when forced to extract two of the most statistically significant features) selects R&D Spend and Marketing Spend.¶

>This result is in agreement with the penultimate observation of Backward Selection.¶

Now, implement the model as per features given by Forward Selection¶

Data Visualization¶

Colour Code¶

-- Red point indicates machine's predicted profit¶

-- Blue point indicates actual profit¶

visual on actual versus predicted profit (evaluated by sklearn)¶

visual on actual versus predicted profit (evaluated by manual backward elimination)¶

visual on actual versus predicted profit (evaluated by manual forward selection)¶

quantitative data on the actual profit versus the predicted profit (evaluated by sklearn)¶

quantitative data on the actual profit versus the predicted profit (evaluated by manual backward elimination)¶

quantitative data on the actual profit versus the predicted profit (evaluated by manual forward selection)¶

Comparison of r2_scores (an accuracy measure of regression models graded 0 to 1)¶

scikit learn prediction¶

backward elimination's prediction (only R&D Spend accounted)¶

feature selection's prediction (both R&D spend and Marketing Spend accounted)¶

Analysis 5:¶

> When the models are run over a variety of random states, the model using only the 'R&D' and 'Marketing' features generally outperforms the other two (it has the highest average r2_score), although not in every individual run.¶