# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the Kaggle input directory, then print
# one full path per line.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/098765432/Depressiontweetdata12.csv

import the libraries¶

# pandas
import pandas as pd
from pandas import Series,DataFrame

# numpy, matplotlib, seaborn
import numpy as np
import matplotlib.pyplot as plt
import missingno as missing
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

import random
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve, validation_curve
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.metrics import roc_curve, roc_auc_score ,auc, plot_roc_curve
from sklearn import svm
import sklearn.metrics

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

Data reading¶

# Location of the survey/tweet CSV inside the Kaggle input directory.
DATA_PATH = "../input/098765432/Depressiontweetdata12.csv"
df = pd.read_csv(DATA_PATH)

# Show the first rows as a quick sanity check of the load.
df.head()

Training Data Info¶

# Print column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1429 entries, 0 to 1428
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Survey_id              1429 non-null   int64  
 1   tweet                  1429 non-null   object 
 2   Gender                 1429 non-null   int64  
 3   Age                    1429 non-null   int64  
 4   Married                1429 non-null   int64  
 5   Number_children        1429 non-null   int64  
 6   education_level        1429 non-null   int64  
 7   total_members          1429 non-null   int64  
 8   gained_asset           1429 non-null   int64  
 9   durable_asset          1429 non-null   int64  
 10  save_asset             1429 non-null   int64  
 11  living_expenses        1429 non-null   int64  
 12  other_expenses         1429 non-null   int64  
 13  incoming_salary        1429 non-null   int64  
 14  incoming_own_farm      1429 non-null   int64  
 15  incoming_business      1429 non-null   int64  
 16  incoming_no_business   1429 non-null   int64  
 17  incoming_agricultural  1429 non-null   int64  
 18  farm_expenses          1429 non-null   int64  
 19  labor_primary          1429 non-null   int64  
 20  lasting_investment     1429 non-null   int64  
 21  no_lasting_investmen   1409 non-null   float64
 22  depressed              1429 non-null   int64  
 23  category               1429 non-null   object 
dtypes: float64(1), int64(21), object(2)
memory usage: 268.1+ KB

# Summary statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

# Count the distinct values in each object-dtype (categorical/text) column.

df.select_dtypes('object').nunique()

tweet       75
category     5
dtype: int64

Classification¶

# - Build the feature matrix and the label vector.
# All columns except the target are kept, then the first two remaining
# columns are sliced off (Survey_id and tweet, per the df.info() listing
# recorded in this notebook).
features = df.drop(columns='depressed').values[:, 2:]
labels = df['depressed'].values

# - Report how many rows carry label 1 and how many carry label 0.
print(np.count_nonzero(labels == 1), np.count_nonzero(labels == 0))

238 1191

# Bar chart of the number of rows for each value of `depressed`.
df.groupby(["depressed"]).size().plot(kind="bar",fontsize=30)

<matplotlib.axes._subplots.AxesSubplot at 0x7efc90205cd0>

**depressed**: 0 = not depressed (negative), 1 = depressed (positive)

# import the pyplot library
import matplotlib.pyplot as plotter

# The slice names of the class-distribution pie chart
pieLabels = 'Depressed', 'Non-Depressed'

# Share of each class in percent, derived from the data instead of typed in
# by hand, so the chart cannot drift from the frame and the slices sum to 100.
# `depressed` holds 1 for depressed rows and 0 otherwise.
depressedMask = df['depressed'] == 1
populationShare = [depressedMask.mean() * 100, (~depressedMask).mean() * 100]

figureObject, axesObject = plotter.subplots()

# Draw the pie chart; autopct prints each share with two decimals.
axesObject.pie(populationShare,
               labels=pieLabels,
               autopct='%1.2f',
               startangle=130)

# Aspect ratio - equal means pie is a circle
axesObject.axis('equal')

plotter.show()

# Histogram of the Age column.
hist = df['Age'].hist()

# Count of each `depressed` value, one panel per `Married` value.
# `factorplot` was renamed to `catplot` in seaborn 0.9 and removed in later
# releases; `catplot` accepts the same arguments used here.
sns.catplot(x='depressed', col='Married', kind='count', data=df)

<seaborn.axisgrid.FacetGrid at 0x7efc8ffbc990>

# Draw the distribution of four columns side by side to eyeball skewness.
plt.figure(figsize=(25, 6))

skew_columns = ['Age', 'Number_children', 'education_level', 'no_lasting_investmen']
for position, column in enumerate(skew_columns, start=1):
    # One row, four panels; `position` is the 1-based panel index.
    plt.subplot(1, 4, position)
    sns.distplot(df[column])

plt.suptitle('Checking for Skewness', fontsize=15)
plt.show()

Depression classes compared by gender, age, number of children, education level, category and non-business income¶

# Per-value row counts of six columns, split by the `depressed` label.
plt.figure(figsize=(20, 15))

count_columns = (
    'Gender',
    'Age',
    'Number_children',
    'education_level',
    'category',
    'incoming_no_business',
)
for slot, column in enumerate(count_columns, start=1):
    # 3x3 grid; only the first six slots are filled.
    plt.subplot(3, 3, slot)
    sns.countplot(x=column, hue='depressed', data=df)

plt.tight_layout()
plt.show()

depressed vs all features¶

# Summary statistics for the numeric columns.
df.describe()

# Drop every column that should not appear in the pair plot; what remains is
# the handful of columns compared pairwise, coloured by `depressed`.
dfPairplot = df.drop(
    [
        'save_asset', 'Survey_id', 'category', 'Gender', 'Age', 'Married',
        'Number_children', 'education_level', 'total_members',
        'living_expenses', 'other_expenses', 'incoming_salary',
        'incoming_own_farm', 'incoming_business', 'incoming_no_business',
        'incoming_agricultural', 'farm_expenses', 'labor_primary',
        'lasting_investment', 'no_lasting_investmen',
    ],
    axis=1,
)
dfPairplot.head()
# pairplot creates its own figure, so the plt.figure(figsize=(25,6)) call that
# used to precede it only produced an empty "0 Axes" figure and was removed.
sns.pairplot(data=dfPairplot, hue='depressed', plot_kws={'alpha': 0.2})
plt.show()

<Figure size 1800x432 with 0 Axes>

age vs depressed¶

# Age density curves, one per `depressed` value, on a wide single-row grid.
facet = sns.FacetGrid(df, hue="depressed", aspect=4)
(
    facet
    .map(sns.kdeplot, 'Age', shade=True)
    # Clamp the x axis to [0, oldest age in the data].
    .set(xlim=(0, df['Age'].max()))
    .add_legend()
)

plt.show()

Correlation matrix of the features¶

# Correlation heatmap of the features (no_lasting_investmen is left out).
dfCorr = df.drop(['no_lasting_investmen'], axis=1)

plt.subplots(figsize=(20,10))
# `tweet` and `category` are object-dtype columns. Older pandas silently
# skipped them in corr(), but pandas >= 2.0 raises on non-numeric data, so
# restrict to numeric columns explicitly (works on every pandas version).
numeric_corr = dfCorr.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True, fmt=".2f")
plt.show()

Modeling¶

(implement-SVM-Classifier)¶

%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

train the model¶

from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# Turn the tweet text into a sparse token-count matrix, ignoring English
# stop words.
count_vectorizer = CountVectorizer(stop_words='english') 
cv = count_vectorizer.fit_transform(df['tweet'])
# (number of tweets, vocabulary size)
cv.shape

(1429, 157)

# Hold out 20% of the rows; stratify so both splits keep the class ratio of
# `depressed`, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    cv, df['depressed'], test_size=.2, stratify=df['depressed'], random_state=42)

# Support-vector classifier with scikit-learn's default settings.
svc = svm.SVC()
svc.fit(X_train, y_train)
prediction_svc = svc.predict(X_test)
# scikit-learn metrics take (y_true, y_pred) in that order.
print(accuracy_score(y_test, prediction_svc))
# The labels are imbalanced (238 depressed vs 1191 not), so accuracy alone can
# look good even if the minority class is never predicted; print per-class
# precision/recall too. zero_division=0 avoids a warning when a class has no
# predictions.
# NOTE(review): the recorded accuracy (0.832) is about what a constant
# "not depressed" prediction would score on this split - verify.
print(classification_report(y_test, prediction_svc, zero_division=0))

0.8321678321678322

from sklearn.datasets import make_blobs

# Synthetic 2-D demo data: 100 points in two clusters. This is unrelated to
# the tweet data frame, and it binds the global names X and y.
X, y = make_blobs(
    n_samples=100,
    n_features=2,
    centers=2,
    cluster_std=1.5,
    random_state=2,
)
# Scatter the two feature columns, coloured by cluster label.
plt.scatter(*X.T, c=y, s=50, cmap='RdBu');

Naive bayes classifier¶

# Stratified 80/20 split of the tweet count matrix against `depressed`.
X_train, X_test, y_train, y_test = train_test_split(
    cv, df['depressed'], test_size=.2, stratify=df['depressed'], random_state=42)

# Model training/fitting

# `feature_extraction` is not imported anywhere in the visible notebook, so
# `feature_extraction.text.CountVectorizer` raised NameError; use the
# CountVectorizer class that is imported directly from
# sklearn.feature_extraction.text instead.
f = CountVectorizer(stop_words='english')
X = f.fit_transform(df["tweet"])
np.shape(X)

(1429, 157)

# Use the binary `depressed` column as the target. The previous
# df["category"].map({'depresssed': 1, 'non-depressed': 0}) matched none of the
# values stored in `category` (e.g. 'major'), which replaced the whole column
# with NaN (the later describe() output shows count 0 for it), so `category`
# is now left untouched.
# `model_selection` is not imported in the visible notebook; call the
# directly imported train_test_split instead.
X_train, X_test, y_train, y_test = train_test_split(
    X, df['depressed'], test_size=0.33, random_state=42)
print([np.shape(X_train), np.shape(X_test)])

[(957, 157), (472, 157)]

from sklearn.model_selection import train_test_split

# Split the data: 70% for training, 30% for testing.
# Train on the tweet count matrix `cv` with the `depressed` labels. The
# previous call used the globals X and y, and `y` is bound by the make_blobs
# demo (100 synthetic labels), so the model was not evaluated on the tweets
# (the recorded report covers only 30 test samples).
# GaussianNB needs a dense array, hence .toarray(); the split is stratified
# and seeded so the class ratio is kept and the result is reproducible.
X_train_bayes, X_test_bayes, y_train_bayes, y_test_bayes = train_test_split(
    cv.toarray(), df['depressed'],
    test_size=.3, stratify=df['depressed'], random_state=42)

#Import Gaussian Naive Bayes model
from sklearn.naive_bayes import GaussianNB

#Create a Gaussian Classifier
model = GaussianNB()
model.fit(X_train_bayes, y_train_bayes)

GaussianNB()

from sklearn.metrics import precision_score, recall_score, confusion_matrix, f1_score, classification_report
# Predict labels for the held-out Naive Bayes test split.
y_pred_bayes = model.predict(X_test_bayes)

# Confusion matrix of true labels (first argument) against predictions.
cm = confusion_matrix(y_test_bayes, y_pred_bayes)
print(cm)
# Print the precision and recall, among other metrics
print(classification_report(y_test_bayes, y_pred_bayes, digits=2))

[[16  0]
 [ 0 14]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       1.00      1.00      1.00        14

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

Comparison of ML classifiers (accuracy)¶

import numpy as np 
import matplotlib.pyplot as plt  
  
   
# Accuracy per classifier, typed in by hand rather than read from the models.
# NOTE(review): the 1.00 for Naive Bayes matches the recorded report whose
# support totals 30 samples - confirm both numbers come from the same data
# before comparing them.
data = {'SVM-classifier': 0.83, 'Naive bayes classifier': 1.00}
courses = [*data]
values = [data[label] for label in courses]

fig = plt.figure(figsize=(10, 5))

# One maroon bar per classifier.
plt.bar(courses, values, color='maroon', width=0.4)

plt.xlabel("machine learning classifier")
plt.ylabel("Accuracy score")
plt.title("compare the all classifier algorithms")
plt.show()

	Survey_id	tweet	Gender	Age	Married	Number_children	education_level	total_members	gained_asset	durable_asset	...	incoming_own_farm	incoming_no_business	incoming_agricultural	farm_expenses	labor_primary	lasting_investment	no_lasting_investmen	depressed	category
0	926	damn taking this personality quiz and realizin...	1	28	1	4	10	5	28912201	22861940	...	0	0	30028818	31363432	0	28411718	28292707.0	0	major
1	747	gohan in general needs more love lol will neve...	1	23	1	3	8	5	28912201	22861940	...	0	0	30028818	31363432	0	28411718	28292707.0	1	major
2	1190	damn taking this personality quiz and realizin...	1	22	1	3	9	5	28912201	22861940	...	0	0	30028818	31363432	0	28411718	28292707.0	0	major
3	1065	my depression is really kicking my ass right n...	1	27	1	2	10	4	52667108	19698904	...	1	1	22288055	18751329	0	7781123	69219765.0	0	major
4	806	gohan in general needs more love lol will neve...	0	59	0	4	10	6	82606287	17352654	...	0	0	53384566	20731006	1	20100562	43419447.0	0	major

	Survey_id	Gender	Age	Married	Number_children	education_level	total_members	gained_asset	durable_asset	save_asset	...	incoming_salary	incoming_own_farm	incoming_business	incoming_no_business	incoming_agricultural	farm_expenses	labor_primary	lasting_investment	no_lasting_investmen	depressed
count	1429.00000	1429.000000	1429.000000	1429.000000	1429.000000	1429.000000	1429.000000	1.429000e+03	1.429000e+03	1.429000e+03	...	1429.000000	1429.000000	1429.000000	1429.000000	1.429000e+03	1.429000e+03	1429.000000	1.429000e+03	1.409000e+03	1429.000000
mean	715.00000	0.918125	34.777467	0.772568	2.883135	8.687194	4.969209	3.363448e+07	2.717296e+07	2.742471e+07	...	0.179846	0.251924	0.107768	0.260322	3.451039e+07	3.549153e+07	0.213436	3.299222e+07	3.360385e+07	0.166550
std	412.66108	0.274271	13.986219	0.419320	1.874472	2.923532	1.786317	2.003854e+07	1.815672e+07	1.775137e+07	...	0.384194	0.434270	0.310195	0.438964	2.077846e+07	2.112372e+07	0.409876	2.121621e+07	2.160228e+07	0.372704
min	1.00000	0.000000	17.000000	0.000000	0.000000	1.000000	1.000000	3.251120e+05	1.625560e+05	1.729660e+05	...	0.000000	0.000000	0.000000	0.000000	3.251120e+05	2.715050e+05	0.000000	7.429200e+04	1.263120e+05	0.000000
25%	358.00000	1.000000	25.000000	1.000000	2.000000	8.000000	4.000000	2.326982e+07	1.929852e+07	2.339998e+07	...	0.000000	0.000000	0.000000	0.000000	2.322229e+07	2.279966e+07	0.000000	2.001911e+07	2.064203e+07	0.000000
50%	715.00000	1.000000	30.000000	1.000000	3.000000	9.000000	5.000000	2.891220e+07	2.286194e+07	2.339998e+07	...	0.000000	0.000000	0.000000	0.000000	3.002882e+07	3.136343e+07	0.000000	2.841172e+07	2.829271e+07	0.000000
75%	1072.00000	1.000000	42.000000	1.000000	4.000000	10.000000	6.000000	3.717283e+07	2.656950e+07	2.339998e+07	...	0.000000	1.000000	0.000000	1.000000	4.003842e+07	4.348584e+07	0.000000	3.982686e+07	4.151762e+07	0.000000
max	1429.00000	1.000000	91.000000	1.000000	11.000000	19.000000	12.000000	9.912755e+07	9.961560e+07	9.992676e+07	...	1.000000	1.000000	1.000000	1.000000	9.978910e+07	9.965119e+07	1.000000	9.944667e+07	9.965119e+07	1.000000

	Survey_id	Gender	Age	Married	Number_children	education_level	total_members	gained_asset	durable_asset	save_asset	...	incoming_own_farm	incoming_business	incoming_no_business	incoming_agricultural	farm_expenses	labor_primary	lasting_investment	no_lasting_investmen	depressed	category
count	1429.00000	1429.000000	1429.000000	1429.000000	1429.000000	1429.000000	1429.000000	1.429000e+03	1.429000e+03	1.429000e+03	...	1429.000000	1429.000000	1429.000000	1.429000e+03	1.429000e+03	1429.000000	1.429000e+03	1.409000e+03	1429.000000	0.0
mean	715.00000	0.918125	34.777467	0.772568	2.883135	8.687194	4.969209	3.363448e+07	2.717296e+07	2.742471e+07	...	0.251924	0.107768	0.260322	3.451039e+07	3.549153e+07	0.213436	3.299222e+07	3.360385e+07	0.166550	NaN
std	412.66108	0.274271	13.986219	0.419320	1.874472	2.923532	1.786317	2.003854e+07	1.815672e+07	1.775137e+07	...	0.434270	0.310195	0.438964	2.077846e+07	2.112372e+07	0.409876	2.121621e+07	2.160228e+07	0.372704	NaN
min	1.00000	0.000000	17.000000	0.000000	0.000000	1.000000	1.000000	3.251120e+05	1.625560e+05	1.729660e+05	...	0.000000	0.000000	0.000000	3.251120e+05	2.715050e+05	0.000000	7.429200e+04	1.263120e+05	0.000000	NaN
25%	358.00000	1.000000	25.000000	1.000000	2.000000	8.000000	4.000000	2.326982e+07	1.929852e+07	2.339998e+07	...	0.000000	0.000000	0.000000	2.322229e+07	2.279966e+07	0.000000	2.001911e+07	2.064203e+07	0.000000	NaN
50%	715.00000	1.000000	30.000000	1.000000	3.000000	9.000000	5.000000	2.891220e+07	2.286194e+07	2.339998e+07	...	0.000000	0.000000	0.000000	3.002882e+07	3.136343e+07	0.000000	2.841172e+07	2.829271e+07	0.000000	NaN
75%	1072.00000	1.000000	42.000000	1.000000	4.000000	10.000000	6.000000	3.717283e+07	2.656950e+07	2.339998e+07	...	1.000000	0.000000	1.000000	4.003842e+07	4.348584e+07	0.000000	3.982686e+07	4.151762e+07	0.000000	NaN
max	1429.00000	1.000000	91.000000	1.000000	11.000000	19.000000	12.000000	9.912755e+07	9.961560e+07	9.992676e+07	...	1.000000	1.000000	1.000000	9.978910e+07	9.965119e+07	1.000000	9.944667e+07	9.965119e+07	1.000000	NaN