# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    joined_paths = (os.path.join(dirname, filename) for filename in filenames)
    for joined_path in joined_paths:
        print(joined_path)
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
# pandas
import pandas as pd
from pandas import Series,DataFrame
# numpy, matplotlib, seaborn
import numpy as np
import matplotlib.pyplot as plt
import missingno as missing
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline
import random
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve, validation_curve
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.metrics import roc_curve, roc_auc_score ,auc, plot_roc_curve
from sklearn import svm
import sklearn.metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
# Read the depression tweet dataset from the Kaggle input directory.
df = pd.read_csv("../input/098765432/Depressiontweetdata12.csv")

# Quick inspection: leading rows, per-column dtype / non-null summary,
# and descriptive statistics.
df.head(n=5)
df.info()
df.describe()

# Count the distinct values in every object-typed column.
object_columns = df.select_dtypes(include='object')
object_columns.nunique()
# Feature matrix: every column except 'depressed', then skip the first two
# of the remaining columns.
features = df.drop(columns='depressed').to_numpy()[:, 2:]
# Target vector: the 'depressed' column.
labels = df['depressed'].to_numpy()

# Report how many rows carry label 1 and how many carry label 0.
positive_count = int(np.count_nonzero(labels == 1))
negative_count = int(np.count_nonzero(labels == 0))
print(positive_count, negative_count)

# Bar chart of the number of rows per 'depressed' value.
class_sizes = df.groupby(["depressed"]).size()
class_sizes.plot.bar(fontsize=30)
# depressed: 0 = not depressed (negative), 1 = depressed (positive)
# Pie chart of the share of depressed vs. non-depressed rows.
import matplotlib.pyplot as plotter

# Slice names, in the same order as the values in populationShare.
pieLabels = 'Depressed', 'Non-Depressed'

# Derive the slice sizes from the data instead of hard-coding percentages,
# so the chart stays correct if the dataset changes. pie() normalises the
# counts to fractions, and autopct prints them as percentages.
populationShare = [
    int((df['depressed'] == 1).sum()),
    int((df['depressed'] == 0).sum()),
]

figureObject, axesObject = plotter.subplots()

# Draw the pie chart.
axesObject.pie(
    populationShare,
    labels=pieLabels,
    autopct='%1.2f',
    startangle=130,
)

# Equal aspect ratio so the pie is drawn as a circle.
axesObject.axis('equal')
plotter.show()
# Histogram of the 'Age' column.
hist = df['Age'].hist()

# Count of each 'depressed' value, one panel per 'Married' value.
# sns.factorplot was renamed to sns.catplot (seaborn 0.9) and later removed;
# with kind='count' given explicitly the two calls draw the same plot.
sns.catplot(x='depressed', col='Married', kind='count', data=df)
# Distribution plot for each listed column, side by side on one row,
# to eyeball skewness.
skew_columns = ['Age', 'Number_children', 'education_level', 'no_lasting_investmen']

plt.figure(figsize=(25, 6))
for panel_index, column_name in enumerate(skew_columns, start=1):
    plt.subplot(1, 4, panel_index)
    sns.distplot(df[column_name])
plt.suptitle('Checking for Skewness', fontsize=15)
plt.show()
# Count plots of several columns, each split by the 'depressed' label.
# Six panels are placed in the first six cells of a 3x3 grid.
count_columns = [
    'Gender',
    'Age',
    'Number_children',
    'education_level',
    'category',
    'incoming_no_business',
]

plt.figure(figsize=(20, 15))
for cell_index, column_name in enumerate(count_columns, start=1):
    plt.subplot(3, 3, cell_index)
    sns.countplot(x=column_name, hue='depressed', data=df)
plt.tight_layout()
plt.show()
df.describe()

# Columns left out of the pair plot; whatever remains (including 'depressed',
# which is used as the hue) is plotted.
pairplot_excluded = [
    'save_asset', 'Survey_id', 'category', 'Gender', 'Age', 'Married',
    'Number_children', 'education_level', 'total_members', 'living_expenses',
    'other_expenses', 'incoming_salary', 'incoming_own_farm',
    'incoming_business', 'incoming_no_business', 'incoming_agricultural',
    'farm_expenses', 'labor_primary', 'lasting_investment',
    'no_lasting_investmen',
]
dfPairplot = df.drop(pairplot_excluded, axis=1)
dfPairplot.head()

# pairplot is a figure-level function that creates its own figure, so no
# plt.figure(...) call precedes it: that call only produced an extra empty
# figure and its figsize never applied to the pair plot.
sns.pairplot(data=dfPairplot, hue='depressed', plot_kws={'alpha': 0.2})
plt.show()
# Shaded kernel density estimate of 'Age', one curve per 'depressed' value.
age_upper_bound = df['Age'].max()

facet = sns.FacetGrid(df, hue="depressed", aspect=4)
facet.map(sns.kdeplot, 'Age', shade=True)
# Limit the x axis to the range from 0 up to the largest age in the data.
facet.set(xlim=(0, age_upper_bound))
facet.add_legend()
plt.show()
# Correlation heatmap of all columns except 'no_lasting_investmen'.
dfCorr = df.drop(['no_lasting_investmen'], axis=1)

# Keep only numeric/boolean columns before computing correlations.
# From pandas 2.0 DataFrame.corr() no longer drops non-numeric columns
# silently and raises on text columns such as 'tweet'; selecting the dtypes
# explicitly works on both older and newer pandas versions.
numericCorr = dfCorr.select_dtypes(include=['number', 'bool'])

plt.subplots(figsize=(20, 10))
sns.heatmap(numericCorr.corr(), annot=True, fmt=".2f")
plt.show()
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
# Turn the tweets into a bag-of-words count matrix (English stop words removed).
count_vectorizer = CountVectorizer(stop_words='english')
cv = count_vectorizer.fit_transform(df['tweet'])
cv.shape

# 80/20 split, stratified on the 'depressed' label, with a fixed seed.
depressed_labels = df['depressed']
X_train, X_test, y_train, y_test = train_test_split(
    cv,
    depressed_labels,
    test_size=.2,
    stratify=depressed_labels,
    random_state=42,
)

# Fit a support vector classifier with default settings and score it on the
# held-out part.
svc = svm.SVC()
svc.fit(X_train, y_train)
prediction_svc = svc.predict(X_test)
# Arguments are in (y_true, y_pred) order; accuracy is symmetric, so the
# printed value does not depend on the order.
print(accuracy_score(y_test, prediction_svc))
from sklearn.datasets import make_blobs

# Generate 100 synthetic 2-D points in two clusters and plot them, coloured
# by cluster id.
# NOTE(review): X and y here are synthetic and unrelated to the tweet data;
# they stay bound at module level after this cell.
X, y = make_blobs(
    n_samples=100,
    n_features=2,
    centers=2,
    cluster_std=1.5,
    random_state=2,
)
first_coordinate = X[:, 0]
second_coordinate = X[:, 1]
plt.scatter(first_coordinate, second_coordinate, c=y, s=50, cmap='RdBu');
# The code below refers to these sklearn sub-packages by module name; they
# were never imported, which raised NameError.
from sklearn import feature_extraction, model_selection

# (The original first line repeated the earlier 80/20 split with identical
# arguments; its result was overwritten below before any use, so it is dropped.)

# Bag-of-words count matrix of the tweets (English stop words removed).
f = feature_extraction.text.CountVectorizer(stop_words='english')
X = f.fit_transform(df["tweet"])
np.shape(X)

# Encode the 'category' column as 1 / 0. Values not listed in the mapping
# become NaN.
# NOTE(review): the key 'depresssed' has three 's' characters - confirm it
# matches the spelling used in the data. This also overwrites df['category']
# in place, so running it twice turns the whole column into NaN.
df["category"] = df["category"].map({'depresssed': 1, 'non-depressed': 0})

# 67/33 split with a fixed seed.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, df['category'], test_size=0.33, random_state=42
)
print([np.shape(X_train), np.shape(X_test)])
from sklearn.model_selection import train_test_split
# GaussianNB stays imported in case later cells use it; the model below is
# MultinomialNB.
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import precision_score, recall_score, confusion_matrix, f1_score, classification_report

# Split the data: 70% training, 30% test.
# The classifier is trained on the tweet count matrix `cv` and the
# 'depressed' labels. The original call used `y` from the make_blobs demo,
# i.e. synthetic labels unrelated to the tweets. The split is stratified and
# seeded like the SVM split so the result is reproducible.
X_train_bayes, X_test_bayes, y_train_bayes, y_test_bayes = train_test_split(
    cv,
    df['depressed'],
    test_size=.3,
    stratify=df['depressed'],
    random_state=42,
)

# Multinomial Naive Bayes is the variant meant for count features such as
# word counts and accepts the sparse matrix directly; GaussianNB requires
# dense input.
model = MultinomialNB()
model.fit(X_train_bayes, y_train_bayes)

y_pred_bayes = model.predict(X_test_bayes)
cm = confusion_matrix(y_test_bayes, y_pred_bayes)
print(cm)
# Print the precision and recall, among other metrics.
print(classification_report(y_test_bayes, y_pred_bayes, digits=2))
import numpy as np
import matplotlib.pyplot as plt

# Bar chart comparing the accuracy of the two classifiers. The scores are
# computed from the predictions instead of being typed in by hand, so the
# chart cannot drift from the models.

# The name y_test is reused by a later split, so recover the labels that
# belong to prediction_svc by repeating the SVM split with the same
# arguments and seed (the split is deterministic).
svm_split = train_test_split(
    cv,
    df['depressed'],
    test_size=.2,
    stratify=df['depressed'],
    random_state=42,
)
y_test_svc = svm_split[3]

# Classifier name -> accuracy on its own held-out set.
data = {
    'SVM-classifier': accuracy_score(y_test_svc, prediction_svc),
    'Naive bayes classifier': accuracy_score(y_test_bayes, y_pred_bayes),
}
courses = list(data.keys())
values = list(data.values())

fig = plt.figure(figsize=(10, 5))
# Draw the bar plot.
plt.bar(courses, values, color='maroon', width=0.4)
plt.xlabel("machine learning classifier")
plt.ylabel("Accuracy score")
plt.title("compare the all classifier algorithms")
plt.show()