Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions koder.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Blood pressure regression and lifestyle classification (ML101)\n",
    "\n",
    "Pipeline, top to bottom:\n",
    "\n",
    "1. Load the training set and clean it (drop sparse rows, treat negative readings as missing, interpolate).\n",
    "2. Clean the test set the same way, without dropping any row.\n",
    "3. Fit a least-squares linear model for systolic and diastolic blood pressure and predict on the test set.\n",
    "4. Fit one Gaussian per (gender, lifestyle) pair and label each test row with the class at the smallest Mahalanobis distance.\n",
    "\n",
    "Outputs: `predicted_blood_pressure.csv` and `predicted_lifestyle.csv`, one row per test row, in test-set order."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "trusted": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.linear_model import LinearRegression\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import mean_squared_error, r2_score\n",
    "from sklearn.compose import ColumnTransformer\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.preprocessing import OneHotEncoder\n",
    "from sklearn.impute import SimpleImputer\n",
    "from sklearn.experimental import enable_iterative_imputer\n",
    "from sklearn.impute import IterativeImputer\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configuration\n",
    "\n",
    "Paths, column names and thresholds used by every later cell."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# Kaggle mounts the competition data read-only under /kaggle/input.\n",
    "TRAIN_CSV = '/kaggle/input/machine-learning-101/ML101_train_dataset.csv'\n",
    "TEST_CSV = '/kaggle/input/machine-learning-101/ML101_dataset_test_feature.csv'\n",
    "BP_OUTPUT_CSV = 'predicted_blood_pressure.csv'\n",
    "LIFESTYLE_OUTPUT_CSV = 'predicted_lifestyle.csv'\n",
    "\n",
    "GENDER_COL = 'Gender'\n",
    "LABEL_COL = 'LifeStyle'\n",
    "TARGET_COLS = ['Systolic BP', 'Diastolic BP']\n",
    "GENDER_CODES = {'Male': 0, 'Female': 1}\n",
    "# Order matters: ties in the classifier resolve to the earliest class listed.\n",
    "CLASS_NAMES = ['Bad', 'Average', 'Good', 'Great']\n",
    "# A training row is dropped once its NaN count reaches this share of (n_columns - 1).\n",
    "MAX_MISSING_FRACTION = 1 / 3"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load and clean the training data\n",
    "\n",
    "Read the training CSV with pandas and preprocess it:\n",
    "\n",
    "- drop rows with too many NaN values, or with a `Gender` / `LifeStyle` outside the known values;\n",
    "- treat inconsistent (negative) values as missing;\n",
    "- fill missing numeric values by linear interpolation down each column."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "trusted": true
   },
   "outputs": [],
   "source": [
    "def clean_numeric(frame, columns):\n",
    "    \"\"\"Return a copy of `frame` with negative entries and gaps in `columns` imputed.\n",
    "\n",
    "    Negative entries are treated as missing; gaps are then filled by linear\n",
    "    interpolation down each column. `limit_direction='both'` also fills gaps at\n",
    "    the very top of a column, which forward-only interpolation would leave as NaN.\n",
    "    The input frame is not modified.\n",
    "    \"\"\"\n",
    "    cleaned = frame.copy()\n",
    "    values = cleaned[columns]\n",
    "    cleaned[columns] = values.mask(values < 0).interpolate(\n",
    "        method='linear', axis=0, limit_direction='both'\n",
    "    )\n",
    "    return cleaned"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "trusted": true
   },
   "outputs": [],
   "source": [
    "train_raw = pd.read_csv(TRAIN_CSV)\n",
    "\n",
    "# Every column that is not the gender, the label or a regression target is a numeric feature.\n",
    "feature_columns = [\n",
    "    col for col in train_raw.columns\n",
    "    if col not in (GENDER_COL, LABEL_COL, *TARGET_COLS)\n",
    "]\n",
    "\n",
    "# The NaN count is taken on the raw file, before negatives are turned into NaN.\n",
    "max_missing = (len(train_raw.columns) - 1) * MAX_MISSING_FRACTION\n",
    "usable_rows = (\n",
    "    (train_raw.isna().sum(axis=1) < max_missing)\n",
    "    & train_raw[GENDER_COL].isin(list(GENDER_CODES))\n",
    "    & train_raw[LABEL_COL].isin(CLASS_NAMES)\n",
    ")\n",
    "train_clean = clean_numeric(train_raw.loc[usable_rows], feature_columns + TARGET_COLS)\n",
    "\n",
    "assert not train_clean[feature_columns + TARGET_COLS].isna().any().any(), 'NaNs survived cleaning'\n",
    "train_clean.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load and clean the test data\n",
    "\n",
    "The test features get the same numeric cleaning as the training set. No row is dropped, because every test row needs a prediction in both output files. A missing or unknown `Gender` is replaced by the most common gender in the training set."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "trusted": true
   },
   "outputs": [],
   "source": [
    "test_raw = pd.read_csv(TEST_CSV)\n",
    "\n",
    "test_clean = clean_numeric(test_raw, feature_columns)\n",
    "fallback_gender = train_clean[GENDER_COL].mode().iloc[0]\n",
    "test_clean[GENDER_COL] = test_clean[GENDER_COL].where(\n",
    "    test_clean[GENDER_COL].isin(list(GENDER_CODES)), fallback_gender\n",
    ")\n",
    "\n",
    "assert len(test_clean) == len(test_raw)\n",
    "assert not test_clean[feature_columns].isna().any().any(), 'NaNs survived cleaning'\n",
    "test_clean.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Blood pressure regression\n",
    "\n",
    "Prediction of blood pressure (systolic and diastolic) with ordinary least squares on the numeric features plus a 0/1 gender code.\n",
    "\n",
    "- All cleaned training rows are used for fitting; no hold-out set is evaluated here.\n",
    "- The model has no intercept term. **TODO:** consider adding a bias column."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "trusted": true
   },
   "outputs": [],
   "source": [
    "def regression_parameters(x, y):\n",
    "    \"\"\"Least-squares weights `w` minimising `||x @ w - y||^2` (no intercept term).\n",
    "\n",
    "    `np.linalg.lstsq` returns the same solution as `inv(x.T @ x) @ x.T @ y`\n",
    "    when `x` has full column rank, but never forms an explicit inverse, so it\n",
    "    stays usable when features are collinear.\n",
    "    \"\"\"\n",
    "    weights, *_ = np.linalg.lstsq(\n",
    "        np.asarray(x, dtype=float), np.asarray(y, dtype=float), rcond=None\n",
    "    )\n",
    "    return weights\n",
    "\n",
    "\n",
    "def multiple_regression(x_train, y_train, x_test):\n",
    "    \"\"\"Fit on (`x_train`, `y_train`) and return a list with one prediction per row of `x_test`.\"\"\"\n",
    "    weights = regression_parameters(x_train, y_train)\n",
    "    return list(np.asarray(x_test, dtype=float) @ weights)\n",
    "\n",
    "\n",
    "def design_matrix(frame):\n",
    "    \"\"\"Numeric features followed by the 0/1 gender code, as a float array.\n",
    "\n",
    "    Reads `feature_columns`, `GENDER_COL` and `GENDER_CODES` from the notebook\n",
    "    namespace and leaves `frame` untouched.\n",
    "    \"\"\"\n",
    "    gender_code = frame[GENDER_COL].map(GENDER_CODES).to_numpy(dtype=float)\n",
    "    return np.column_stack([frame[feature_columns].to_numpy(dtype=float), gender_code])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "trusted": true
   },
   "outputs": [],
   "source": [
    "X_train = design_matrix(train_clean)\n",
    "X_test = design_matrix(test_clean)\n",
    "\n",
    "bp_predictions = pd.DataFrame({\n",
    "    target: multiple_regression(X_train, train_clean[target].to_numpy(dtype=float), X_test)\n",
    "    for target in TARGET_COLS\n",
    "})\n",
    "bp_predictions.to_csv(BP_OUTPUT_CSV, index=False)\n",
    "\n",
    "assert len(bp_predictions) == len(test_raw)\n",
    "bp_predictions.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Lifestyle classification\n",
    "\n",
    "Classification of the test data in terms of lifestyle. For each gender, every lifestyle class is modelled by the mean vector and covariance matrix of its training rows. A test row is scored only against the classes of its own gender and receives the class with the smallest squared Mahalanobis distance, `(x - mu)^T Sigma^-1 (x - mu)`.\n",
    "\n",
    "- Class priors are estimated alongside the other parameters but do not enter the decision rule.\n",
    "- A class with fewer than two training rows for a gender is skipped for that gender."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "trusted": true
   },
   "outputs": [],
   "source": [
    "def md(x, mu, sig):\n",
    "    \"\"\"Squared Mahalanobis distance of `x` from mean `mu` under covariance `sig`.\n",
    "\n",
    "    `x` may be a single feature vector (returns a scalar) or a 2-D array with\n",
    "    one sample per row (returns one distance per row). `sig` is inverted once\n",
    "    per call, not once per sample.\n",
    "    \"\"\"\n",
    "    points = np.asarray(x, dtype=float)\n",
    "    diff = np.atleast_2d(points - mu)\n",
    "    dist = np.einsum('ij,jk,ik->i', diff, np.linalg.inv(sig), diff)\n",
    "    return dist[0] if points.ndim == 1 else dist\n",
    "\n",
    "\n",
    "def fit_class_gaussians(frame):\n",
    "    \"\"\"Estimate mean, covariance and prior of every lifestyle class in `frame`.\n",
    "\n",
    "    Returns a dict keyed by class name, in `CLASS_NAMES` order. Classes with\n",
    "    fewer than two rows are skipped because their covariance is undefined.\n",
    "    \"\"\"\n",
    "    params = {}\n",
    "    for label in CLASS_NAMES:\n",
    "        values = frame.loc[frame[LABEL_COL] == label, feature_columns].to_numpy(dtype=float)\n",
    "        if len(values) < 2:\n",
    "            continue\n",
    "        params[label] = {\n",
    "            'mean': values.mean(axis=0),\n",
    "            'cov': np.cov(values, rowvar=False),\n",
    "            'prior': len(values) / len(frame),\n",
    "        }\n",
    "    return params\n",
    "\n",
    "\n",
    "def classify_lifestyle(features, class_params):\n",
    "    \"\"\"Label each row of the 2-D array `features` with its nearest class.\n",
    "\n",
    "    Nearest means smallest squared Mahalanobis distance; ties go to the class\n",
    "    that comes first in `class_params`.\n",
    "    \"\"\"\n",
    "    labels = np.array(list(class_params), dtype=object)\n",
    "    distances = np.column_stack([\n",
    "        md(features, p['mean'], p['cov']) for p in class_params.values()\n",
    "    ])\n",
    "    return labels[np.argmin(distances, axis=1)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# One set of class parameters per gender, each fitted on that gender's rows only.\n",
    "class_params_by_gender = {\n",
    "    gender: fit_class_gaussians(train_clean[train_clean[GENDER_COL] == gender])\n",
    "    for gender in GENDER_CODES\n",
    "}\n",
    "assert all(class_params_by_gender.values()), 'a gender has no usable training classes'\n",
    "\n",
    "# Each test row is scored against the parameters of its own gender.\n",
    "lifestyle_pred = pd.Series(index=test_clean.index, dtype=object)\n",
    "for gender, class_params in class_params_by_gender.items():\n",
    "    rows = test_clean[GENDER_COL] == gender\n",
    "    if rows.any():\n",
    "        lifestyle_pred.loc[rows] = classify_lifestyle(\n",
    "            test_clean.loc[rows, feature_columns].to_numpy(dtype=float), class_params\n",
    "        )\n",
    "\n",
    "assert lifestyle_pred.notna().all(), 'some test rows were not classified'\n",
    "lifestyle_predictions = pd.DataFrame({LABEL_COL: lifestyle_pred.to_numpy()})\n",
    "lifestyle_predictions.to_csv(LIFESTYLE_OUTPUT_CSV, index=False)\n",
    "lifestyle_predictions.head()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
Loading