In [58]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
/kaggle/input/098765432/Depressiontweetdata12.csv

import the libraries

In [2]:
# pandas
import pandas as pd
from pandas import Series,DataFrame

# numpy, matplotlib, seaborn
import numpy as np
import matplotlib.pyplot as plt
import missingno as missing
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

import random
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve, validation_curve
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.metrics import roc_curve, roc_auc_score ,auc, plot_roc_curve
from sklearn import svm
import sklearn.metrics

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

Data reading

In [3]:
df = pd.read_csv("../input/098765432/Depressiontweetdata12.csv")

# preview the data
df.head()
Out[3]:
Survey_id tweet Gender Age Married Number_children education_level total_members gained_asset durable_asset ... incoming_own_farm incoming_business incoming_no_business incoming_agricultural farm_expenses labor_primary lasting_investment no_lasting_investmen depressed category
0 926 damn taking this personality quiz and realizin... 1 28 1 4 10 5 28912201 22861940 ... 0 0 0 30028818 31363432 0 28411718 28292707.0 0 major
1 747 gohan in general needs more love lol will neve... 1 23 1 3 8 5 28912201 22861940 ... 0 0 0 30028818 31363432 0 28411718 28292707.0 1 major
2 1190 damn taking this personality quiz and realizin... 1 22 1 3 9 5 28912201 22861940 ... 0 0 0 30028818 31363432 0 28411718 28292707.0 0 major
3 1065 my depression is really kicking my ass right n... 1 27 1 2 10 4 52667108 19698904 ... 1 0 1 22288055 18751329 0 7781123 69219765.0 0 major
4 806 gohan in general needs more love lol will neve... 0 59 0 4 10 6 82606287 17352654 ... 0 0 0 53384566 20731006 1 20100562 43419447.0 0 major

5 rows × 24 columns

Training Data Info

In [4]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1429 entries, 0 to 1428
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Survey_id              1429 non-null   int64  
 1   tweet                  1429 non-null   object 
 2   Gender                 1429 non-null   int64  
 3   Age                    1429 non-null   int64  
 4   Married                1429 non-null   int64  
 5   Number_children        1429 non-null   int64  
 6   education_level        1429 non-null   int64  
 7   total_members          1429 non-null   int64  
 8   gained_asset           1429 non-null   int64  
 9   durable_asset          1429 non-null   int64  
 10  save_asset             1429 non-null   int64  
 11  living_expenses        1429 non-null   int64  
 12  other_expenses         1429 non-null   int64  
 13  incoming_salary        1429 non-null   int64  
 14  incoming_own_farm      1429 non-null   int64  
 15  incoming_business      1429 non-null   int64  
 16  incoming_no_business   1429 non-null   int64  
 17  incoming_agricultural  1429 non-null   int64  
 18  farm_expenses          1429 non-null   int64  
 19  labor_primary          1429 non-null   int64  
 20  lasting_investment     1429 non-null   int64  
 21  no_lasting_investmen   1409 non-null   float64
 22  depressed              1429 non-null   int64  
 23  category               1429 non-null   object 
dtypes: float64(1), int64(21), object(2)
memory usage: 268.1+ KB
In [5]:
df.describe()
Out[5]:
Survey_id Gender Age Married Number_children education_level total_members gained_asset durable_asset save_asset ... incoming_salary incoming_own_farm incoming_business incoming_no_business incoming_agricultural farm_expenses labor_primary lasting_investment no_lasting_investmen depressed
count 1429.00000 1429.000000 1429.000000 1429.000000 1429.000000 1429.000000 1429.000000 1.429000e+03 1.429000e+03 1.429000e+03 ... 1429.000000 1429.000000 1429.000000 1429.000000 1.429000e+03 1.429000e+03 1429.000000 1.429000e+03 1.409000e+03 1429.000000
mean 715.00000 0.918125 34.777467 0.772568 2.883135 8.687194 4.969209 3.363448e+07 2.717296e+07 2.742471e+07 ... 0.179846 0.251924 0.107768 0.260322 3.451039e+07 3.549153e+07 0.213436 3.299222e+07 3.360385e+07 0.166550
std 412.66108 0.274271 13.986219 0.419320 1.874472 2.923532 1.786317 2.003854e+07 1.815672e+07 1.775137e+07 ... 0.384194 0.434270 0.310195 0.438964 2.077846e+07 2.112372e+07 0.409876 2.121621e+07 2.160228e+07 0.372704
min 1.00000 0.000000 17.000000 0.000000 0.000000 1.000000 1.000000 3.251120e+05 1.625560e+05 1.729660e+05 ... 0.000000 0.000000 0.000000 0.000000 3.251120e+05 2.715050e+05 0.000000 7.429200e+04 1.263120e+05 0.000000
25% 358.00000 1.000000 25.000000 1.000000 2.000000 8.000000 4.000000 2.326982e+07 1.929852e+07 2.339998e+07 ... 0.000000 0.000000 0.000000 0.000000 2.322229e+07 2.279966e+07 0.000000 2.001911e+07 2.064203e+07 0.000000
50% 715.00000 1.000000 30.000000 1.000000 3.000000 9.000000 5.000000 2.891220e+07 2.286194e+07 2.339998e+07 ... 0.000000 0.000000 0.000000 0.000000 3.002882e+07 3.136343e+07 0.000000 2.841172e+07 2.829271e+07 0.000000
75% 1072.00000 1.000000 42.000000 1.000000 4.000000 10.000000 6.000000 3.717283e+07 2.656950e+07 2.339998e+07 ... 0.000000 1.000000 0.000000 1.000000 4.003842e+07 4.348584e+07 0.000000 3.982686e+07 4.151762e+07 0.000000
max 1429.00000 1.000000 91.000000 1.000000 11.000000 19.000000 12.000000 9.912755e+07 9.961560e+07 9.992676e+07 ... 1.000000 1.000000 1.000000 1.000000 9.978910e+07 9.965119e+07 1.000000 9.944667e+07 9.965119e+07 1.000000

8 rows × 22 columns

In [6]:
# lets check the no. of unique items present in the categorical column

df.select_dtypes('object').nunique()
Out[6]:
tweet       75
category     5
dtype: int64

* # ** classification

In [7]:
# - Get the features and labels
features=df.loc[:,df.columns!='depressed'].values[:,2:]
labels=df.loc[:,'depressed'].values
In [8]:
# - Get the count of each label (0 and 1) in labels
print(labels[labels==1].shape[0], labels[labels==0].shape[0])
238 1191
In [9]:
df.groupby(["depressed"]).size().plot(kind="bar",fontsize=30)
Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x7efc90205cd0>

**depressed: [ Zero: No depressed] or [One: depressed] 0 = negitive 1= positive

In [10]:
# import the pyplot library

import matplotlib.pyplot as plotter

 

# The slice names of a population distribution pie chart

pieLabels              = 'Depressed','Non-Depressed'

 

# Population data

populationShare     = [16.65,83.34]

 

figureObject, axesObject = plotter.subplots()

 

# Draw the pie chart

axesObject.pie(populationShare,

        labels=pieLabels,

        autopct='%1.2f',

        startangle=130)

 

# Aspect ratio - equal means pie is a circle

axesObject.axis('equal')

 

plotter.show()
In [44]:
hist = df['Age'].hist()
In [48]:
sns.factorplot(x='depressed', col='Married', kind='count', data= df)
Out[48]:
<seaborn.axisgrid.FacetGrid at 0x7efc8ffbc990>
In [11]:
plt.figure(figsize=(25,6))
plt.subplot(1, 4, 1)
sns.distplot(df['Age'])

plt.subplot(1, 4, 2)
sns.distplot(df['Number_children'])

plt.subplot(1, 4, 3)
sns.distplot(df['education_level'])

plt.subplot(1, 4, 4)
sns.distplot(df['no_lasting_investmen'])

plt.suptitle('Checking for Skewness', fontsize = 15)
plt.show()

depression classes withAGE,Gender,depression category wise with comparison

In [12]:
plt.figure(figsize=(20,15))

plt.subplot(3,3,1)
sns.countplot(x='Gender', hue='depressed', data=df)
plt.subplot(3,3,2)
sns.countplot(x='Age', hue='depressed', data=df)
plt.subplot(3,3,3)
sns.countplot(x='Number_children', hue='depressed', data=df)

plt.subplot(3,3,4)
sns.countplot(x='education_level', hue='depressed', data=df)
plt.subplot(3,3,5)
sns.countplot(x='category', hue='depressed', data=df)
plt.subplot(3,3,6)
sns.countplot(x='incoming_no_business', hue='depressed', data=df)

plt.tight_layout()
plt.show()

depressed vs all features

In [60]:
df.describe()
Out[60]:
Survey_id Gender Age Married Number_children education_level total_members gained_asset durable_asset save_asset ... incoming_own_farm incoming_business incoming_no_business incoming_agricultural farm_expenses labor_primary lasting_investment no_lasting_investmen depressed category
count 1429.00000 1429.000000 1429.000000 1429.000000 1429.000000 1429.000000 1429.000000 1.429000e+03 1.429000e+03 1.429000e+03 ... 1429.000000 1429.000000 1429.000000 1.429000e+03 1.429000e+03 1429.000000 1.429000e+03 1.409000e+03 1429.000000 0.0
mean 715.00000 0.918125 34.777467 0.772568 2.883135 8.687194 4.969209 3.363448e+07 2.717296e+07 2.742471e+07 ... 0.251924 0.107768 0.260322 3.451039e+07 3.549153e+07 0.213436 3.299222e+07 3.360385e+07 0.166550 NaN
std 412.66108 0.274271 13.986219 0.419320 1.874472 2.923532 1.786317 2.003854e+07 1.815672e+07 1.775137e+07 ... 0.434270 0.310195 0.438964 2.077846e+07 2.112372e+07 0.409876 2.121621e+07 2.160228e+07 0.372704 NaN
min 1.00000 0.000000 17.000000 0.000000 0.000000 1.000000 1.000000 3.251120e+05 1.625560e+05 1.729660e+05 ... 0.000000 0.000000 0.000000 3.251120e+05 2.715050e+05 0.000000 7.429200e+04 1.263120e+05 0.000000 NaN
25% 358.00000 1.000000 25.000000 1.000000 2.000000 8.000000 4.000000 2.326982e+07 1.929852e+07 2.339998e+07 ... 0.000000 0.000000 0.000000 2.322229e+07 2.279966e+07 0.000000 2.001911e+07 2.064203e+07 0.000000 NaN
50% 715.00000 1.000000 30.000000 1.000000 3.000000 9.000000 5.000000 2.891220e+07 2.286194e+07 2.339998e+07 ... 0.000000 0.000000 0.000000 3.002882e+07 3.136343e+07 0.000000 2.841172e+07 2.829271e+07 0.000000 NaN
75% 1072.00000 1.000000 42.000000 1.000000 4.000000 10.000000 6.000000 3.717283e+07 2.656950e+07 2.339998e+07 ... 1.000000 0.000000 1.000000 4.003842e+07 4.348584e+07 0.000000 3.982686e+07 4.151762e+07 0.000000 NaN
max 1429.00000 1.000000 91.000000 1.000000 11.000000 19.000000 12.000000 9.912755e+07 9.961560e+07 9.992676e+07 ... 1.000000 1.000000 1.000000 9.978910e+07 9.965119e+07 1.000000 9.944667e+07 9.965119e+07 1.000000 NaN

8 rows × 23 columns

In [13]:
dfPairplot = df.drop(['save_asset','Survey_id' , 'category' , 'Gender' , 'Age' , 'Married' , 'Number_children' , 'education_level' , 'total_members' , 'living_expenses' , 'other_expenses' , 'incoming_salary' , 'incoming_own_farm' , 'incoming_business' , 'incoming_no_business' , 'incoming_agricultural' , 'farm_expenses' , 'labor_primary' , 'lasting_investment' , 'no_lasting_investmen'], axis=1)
dfPairplot.head()
plt.figure(figsize=(25,6))
sns.pairplot(data=dfPairplot,hue='depressed',plot_kws={'alpha':0.2})
plt.show()
<Figure size 1800x432 with 0 Axes>

age vs depressed

In [14]:
facet = sns.FacetGrid(df,hue="depressed", aspect=4)
facet.map(sns.kdeplot, 'Age', shade=True)
facet.set(xlim=(0,df['Age'].max()))
facet.add_legend()

plt.show()

Confusion -matrix of classes

In [15]:
dfCorr = df.drop(['no_lasting_investmen'], axis=1)
In [16]:
plt.subplots(figsize=(20,10)) 
sns.heatmap(dfCorr.corr(), annot = True, fmt = ".2f")
plt.show()

Modeling

(implement-SVM-Classifier)

In [17]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

train the model

In [19]:
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
In [20]:
count_vectorizer = CountVectorizer(stop_words='english') 
cv = count_vectorizer.fit_transform(df['tweet'])
cv.shape
Out[20]:
(1429, 157)
In [21]:
X_train,X_test,y_train,y_test = train_test_split(cv,df['depressed'] , test_size=.2,stratify=df['depressed'], random_state=42)
In [22]:
svc = svm.SVC()
svc.fit(X_train,y_train)
prediction_svc = svc.predict(X_test)
print(accuracy_score(prediction_svc,y_test))
0.8321678321678322
In [28]:
from sklearn.datasets import make_blobs
X, y = make_blobs(100, 2, centers=2, random_state=2, cluster_std=1.5)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='RdBu');

Naive bayes classifier

In [23]:
X_train,X_test,y_train,y_test = train_test_split(cv,df['depressed'] , test_size=.2,stratify=df['depressed'], random_state=42)
  1. # Model training/fiting
In [25]:
f = feature_extraction.text.CountVectorizer(stop_words = 'english')
X = f.fit_transform(df["tweet"])
np.shape(X)
Out[25]:
(1429, 157)
In [26]:
df["category"]=df["category"].map({'depresssed':1,'non-depressed':0})
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, df['category'], test_size=0.33, random_state=42)
print([np.shape(X_train), np.shape(X_test)])
[(957, 157), (472, 157)]
In [51]:
from sklearn.model_selection import train_test_split

#Se parten los datos para usar 70 Entrenamiento y 30 test:
X_train_bayes, X_test_bayes, y_train_bayes, y_test_bayes = train_test_split(X, y,
                                                    test_size=.3)
In [52]:
#Import Gaussian Naive Bayes model
from sklearn.naive_bayes import GaussianNB

#Create a Gaussian Classifier
model = GaussianNB()
model.fit(X_train_bayes, y_train_bayes)
Out[52]:
GaussianNB()
In [53]:
from sklearn.metrics import precision_score, recall_score, confusion_matrix, f1_score, classification_report
y_pred_bayes = model.predict(X_test_bayes)
In [55]:
cm = confusion_matrix(y_test_bayes, y_pred_bayes)
print(cm)
# Print the precision and recall, among other metrics
print(classification_report(y_test_bayes, y_pred_bayes, digits=2))
[[16  0]
 [ 0 14]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       1.00      1.00      1.00        14

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

Comparison of ML classifier( accuracy )

In [57]:
import numpy as np 
import matplotlib.pyplot as plt  
  
   
# creating the dataset 
data = {'SVM-classifier':0.83, 'Naive bayes classifier':1.00} 
courses = list(data.keys()) 
values = list(data.values()) 
   
fig = plt.figure(figsize = (10, 5)) 
  
# creating the bar plot 
plt.bar(courses, values, color ='maroon',  
        width = 0.4) 
  
plt.xlabel("machine learning classifier") 
plt.ylabel("Accuracy score") 
plt.title("compare the all classifier algorithms") 
plt.show() 
In [ ]: