Fake News Detection : Forums : PythonAnywhere

I don't know how to apply what you guys are saying. I don't have much knowledge about Machine Learning. This is my complete source code. Please help me.
#!/usr/bin/env python
# coding: utf-8

# In[16]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
import re
import string
import nltk
from collections import Counter
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
# Import from the public package: the private ``multilayer_perceptron``
# submodule is not importable on current scikit-learn releases.
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,mean_absolute_error,mean_squared_error

# In[17]:

# Load the dataset; 'train.csv' is resolved relative to the current working
# directory.
fn_df = pd.read_csv('train.csv')

# In[ ]:

# The bare expressions in the cells below only display a value when run as
# notebook cells; executed as a plain script they produce no output.
fn_df.shape

# In[ ]:

fn_df.head()

# In[ ]:

fn_df.tail()

# In[ ]:

# Spot-check the second row's title.
fn_df['title'].iloc[1]

# In[ ]:

# Spot-check the second row's article body.
fn_df['text'].iloc[1]

# In[ ]:

# Same expression as the previous cell (duplicate cell).
fn_df['text'].iloc[1]

# In[ ]:

fn_df.info()

# In[ ]:

# Class balance of the target column.
fn_df['label'].value_counts()

# In[ ]:

# Which columns contain at least one missing value.
fn_df.isnull().any()

# In[ ]:

# How many missing values each column contains.
fn_df.isnull().sum()

# # EDA

# In[ ]:

def word_count(text):
    """Return the number of whitespace-separated tokens in ``str(text)``.

    The argument is converted with ``str`` first, so non-string values are
    counted by their string form.
    """
    tokens = str(text).split()
    return len(tokens)

# In[ ]:

# Per-article token count; word_count() stringifies each value before splitting.
fn_df['word_count'] = fn_df['text'].apply(word_count)

# In[ ]:

# Per-article character count via the pandas string accessor.
# NOTE(review): unlike word_count(), this does not stringify the value first,
# so a missing text presumably yields NaN here — confirm.
fn_df['char_count'] = fn_df['text'].str.len()

# In[ ]:

def upper_count(text):
    """Count the tokens of ``str(text)`` for which ``str.isupper()`` is true.

    Tokens are produced by NLTK's ``word_tokenize``.
    """
    return sum(1 for token in word_tokenize(str(text)) if token.isupper())

# In[ ]:

# Number of all-uppercase tokens in each article, as counted by upper_count().
fn_df['is_upper'] = fn_df['text'].apply(upper_count)

# In[ ]:

def stop_words_count(text):
    """Count the tokens of ``text`` that are NLTK English stop words.

    Args:
        text: Value to analyse; converted with ``str`` before tokenizing.

    Returns:
        The number of ``word_tokenize`` tokens found in the English
        stop-word list. Tokens are compared exactly as they appear, so the
        match is case-sensitive.
    """
    # A set gives constant-time membership tests; the list returned by
    # stopwords.words() would otherwise be scanned once per token.
    stop_words = set(stopwords.words('english'))
    return sum(1 for word in word_tokenize(str(text)) if word in stop_words)

# In[22]:

# Number of English stop-word tokens in each article, as counted by stop_words_count().
fn_df['stopword_presence'] = fn_df['text'].apply(stop_words_count)

# In[23]:

def check_punctuation(text):
    """Report whether ``text`` contains at least one punctuation token.

    Args:
        text: Value to inspect; converted with ``str`` before tokenizing.

    Returns:
        The string ``'True'`` when any ``word_tokenize`` token is found in
        ``string.punctuation``, otherwise the string ``'False'``. Strings
        (not booleans) are returned to keep the column's existing values.
    """
    # A count can never be negative, so a `< 0` test would make 'False'
    # unreachable; test for the presence of a punctuation token instead.
    has_punctuation = any(
        token in string.punctuation for token in word_tokenize(str(text))
    )
    return 'True' if has_punctuation else 'False'

# In[24]:

# check_punctuation() returns the strings 'True'/'False', so this column
# holds strings rather than booleans.
fn_df['punct check'] = fn_df['text'].apply(check_punctuation)

# In[25]:

# Preview the frame with the newly added feature columns.
fn_df.head()

# In[26]:

# Summary statistics of the columns.
fn_df.describe()

# # VISUALIZATION

# In[27]:

# Lowercase every article; str() converts non-string values to text first.
fn_df['text'] = fn_df['text'].apply(lambda text: str(text).lower())

# In[28]:

# Bar chart of how many articles carry each label.
sns.countplot(x='label',data=fn_df)

# In[29]:

# Concatenate all articles of one label into a single corpus string.
# Joining with a space (not '') keeps the last word of one article from
# fusing with the first word of the next, which would distort word counts.
# NOTE(review): the variable names assume label 1 = real and 0 = fake —
# confirm against the dataset's description.
real_tag = ' '.join(news for news in fn_df['text'][fn_df['label']==1])

# In[30]:

fake_tag = ' '.join(news for news in fn_df['text'][fn_df['label']==0])

# In[54]:

# The 25 most frequent whitespace-separated tokens of the label-1 corpus,
# stored as a Series of (word, frequency) tuples.
real_token_counts = Counter(str(real_tag).split())
real_tag_most_word = pd.Series(real_token_counts.most_common(25))

# In[55]:

# Split the (word, frequency) pairs into two parallel columns.
word_list = [pair[0] for pair in real_tag_most_word]
freq_list = [pair[1] for pair in real_tag_most_word]

real_dict = {'Word':word_list,'Frequency':freq_list}
real_tag_most_word_df = pd.DataFrame(real_dict)
real_tag_most_word_df.head(25)

# In[56]:

# Bar chart of the top words; labels are rotated so they stay readable.
plt.figure(figsize=(12,12))
plt.bar(
    x=real_tag_most_word_df['Word'],
    height=real_tag_most_word_df['Frequency'],
)
plt.xticks(rotation=55)
plt.show()

# In[31]:

# Token frequencies of the label-0 corpus. most_common() is called without a
# limit here, so every distinct token is kept (the label-1 cell keeps 25).
fake_tag_most_word = pd.Series(Counter(str(fake_tag).split()).most_common())

# In[32]:

# Word-cloud generator configured for a white background, the wordcloud
# package's STOPWORDS set and at most 50 words. It is only used by the
# commented-out cells below.
wordcloud = WordCloud(background_color='white',stopwords=STOPWORDS,max_words=50)

# In[33]:

# Disabled cell: word cloud for the label-0 corpus.
# wc = wordcloud.generate(fake_tag)
# plt.figure(figsize=(6,4))
# plt.axis('off')
# plt.title('fake news word cloud')
# plt.imshow(wc)

# In[34]:

# Disabled cell: word cloud for the label-1 corpus.
# wc = wordcloud.generate(real_tag)
# plt.figure(figsize=(6,4))
# plt.axis('off')
# plt.title('real news word cloud')
# # plt.imshow(wc)

# # cleaning

# In[36]:

# Preview the (lowercased) text before cleaning.
fn_df['text'].head()

# In[50]:

# Characters stripped when apostrophes are kept: every ASCII punctuation
# character except the straight apostrophe, plus the typographic dashes and
# quotes (– — “ ” ’).
_PUNCTUATION_TO_STRIP = '?|$&*.,!"#\\+-/:–;<=>[]^_{}”“—’`%@()~'
_PUNCTUATION_STRIP_TABLE = str.maketrans('', '', _PUNCTUATION_TO_STRIP)


def remove_punctuation(text,keep_apostrophes=True):
    """Return ``text`` stripped of surrounding whitespace and punctuation.

    Args:
        text: The string to clean.
        keep_apostrophes: When true (default), delete only the characters in
            ``_PUNCTUATION_TO_STRIP``, leaving the straight apostrophe
            ``'`` in place. When false, delete every character that is not
            an ASCII letter, a digit or a space.

    Returns:
        The cleaned string; removed characters are deleted, not replaced
        with spaces.
    """
    text = text.strip()
    if keep_apostrophes:
        # A translation table deletes the listed characters literally. The
        # same list written as a regex character class with '|' separators
        # silently misses '-' (it becomes a range), '\\' (it escapes the
        # following '|') and the closing brackets.
        return text.translate(_PUNCTUATION_STRIP_TABLE)
    return re.sub(r'[^a-zA-Z0-9 ]', r'', text)

# In[51]:

# Strip punctuation from every article; remove_punctuation() is called with
# its default keep_apostrophes=True.
fn_df['text'] = fn_df['text'].apply(remove_punctuation)

# In[38]:

# Inspect both ends of the frame after punctuation removal.
fn_df.head()

# In[39]:

fn_df.tail()

# In[40]:

def remove_stopwords(text):
    """Return ``text`` with NLTK English stop words removed.

    Args:
        text: The string to filter.

    Returns:
        The ``word_tokenize`` tokens that are not in the English stop-word
        list, joined with single spaces. Tokens are compared exactly as they
        appear, so the match is case-sensitive.
    """
    # A set gives constant-time membership tests; the list returned by
    # stopwords.words() would otherwise be scanned once per token.
    stop_words = set(stopwords.words('english'))
    kept_tokens = [word for word in word_tokenize(text)
                   if word not in stop_words]
    return ' '.join(kept_tokens)

# In[41]:

# Drop English stop words from every article.
fn_df['text'] = fn_df['text'].apply(remove_stopwords)

# In[39]:

# Preview the text after stop-word removal.
fn_df.head()

# In[34]:

def stemming(text):
    """Return ``text`` with every token replaced by its Porter stem.

    Tokens are produced by ``word_tokenize`` and the stems are joined with
    single spaces.
    """
    stemmer = PorterStemmer()
    return ' '.join(map(stemmer.stem, word_tokenize(text)))

# In[35]:

# Reduce every token of every article to its Porter stem.
fn_df['text'] = fn_df['text'].apply(stemming)

# In[36]:

# Inspect both ends of the frame after stemming.
fn_df.head()

# In[37]:

fn_df.tail()

# In[38]:

# List the columns now present (original columns plus the EDA features).
fn_df.columns

# In[ ]:

# # splitting data into training and testing set

# In[52]:

# Hold out 30% of the articles for testing. The fixed seed makes the split,
# and therefore every score computed from it, repeatable between runs.
fn_train,fn_test,label_train,label_test = train_test_split(
    fn_df['text'],fn_df['label'],test_size=0.3,shuffle=True,random_state=42)

# In[53]:

print(fn_train.shape, fn_test.shape,label_train.shape,label_test.shape)

# # FEATURE EXTRACTION

# In[54]:

# Raw term counts followed by TF-IDF weighting with sublinear term frequency.
cv = CountVectorizer()
tfidf = TfidfTransformer(sublinear_tf=True)

# In[55]:

# The vocabulary is learned from the training texts only; the test texts are
# projected onto that same vocabulary.
fake_news_train_cv = cv.fit_transform(fn_train)
fake_news_test_cv = cv.transform(fn_test)

# In[56]:

# Likewise, the TF-IDF weights are fitted on the training counts only.
fake_news_train_tfidf = tfidf.fit_transform(fake_news_train_cv)
fake_news_test_tfidf =  tfidf.transform(fake_news_test_cv)

# # DATA MODELLING

# In[57]:

# Each cell fits one classifier on the training TF-IDF matrix, predicts the
# held-out set and prints its accuracy. Estimators that use randomness are
# all seeded with the same value so that their scores are repeatable.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(fake_news_train_tfidf,label_train)
rfc_pred = rfc.predict(fake_news_test_tfidf)
print(accuracy_score(label_test,rfc_pred))

# In[58]:

gbc = GradientBoostingClassifier(random_state=42)
gbc.fit(fake_news_train_tfidf,label_train)
gbc_pred = gbc.predict(fake_news_test_tfidf)
print(accuracy_score(label_test,gbc_pred))

# In[59]:

# NOTE(review): hidden_layer_sizes=3 presumably means a single hidden layer
# of 3 units — confirm this is the intended architecture.
mlc = MLPClassifier(hidden_layer_sizes=3,random_state=42)
mlc.fit(fake_news_train_tfidf,label_train)
mlc_pred = mlc.predict(fake_news_test_tfidf)
print(accuracy_score(label_test,mlc_pred))

# In[60]:

svc = SVC()
svc.fit(fake_news_train_tfidf,label_train)
svc_pred = svc.predict(fake_news_test_tfidf)
print(accuracy_score(label_test,svc_pred))

# In[61]:

mnb = MultinomialNB()
mnb.fit(fake_news_train_tfidf,label_train)
mnb_pred = mnb.predict(fake_news_test_tfidf)
print(accuracy_score(label_test,mnb_pred))

# In[62]:

dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(fake_news_train_tfidf,label_train)
dtc_pred = dtc.predict(fake_news_test_tfidf)
print(accuracy_score(label_test,dtc_pred))

# # EVALUATING MODELS PERFORMANCE

# In[63]:

# (short name, held-out predictions) for every fitted model, in fitting
# order. SVC is included so that all six models are reported.
report_targets = [
    ('RFC', rfc_pred),
    ('GBC', gbc_pred),
    ('MLP', mlc_pred),
    ('SVC', svc_pred),
    ('MNB', mnb_pred),
    ('DTC', dtc_pred),
]
report_separator = '----------------------------------------'

for model_name, model_pred in report_targets:
    print('Classification report for {}:'.format(model_name))
    print(classification_report(label_test,model_pred))
    print(report_separator)

# In[64]:

# Every matrix is headed 'Confusion matrix for ...', including the MNB one.
for model_name, model_pred in report_targets:
    print('Confusion matrix for {}:'.format(model_name))
    print(confusion_matrix(label_test,model_pred))
    print(report_separator)

# In[ ]:

# In[65]:

# # tokenize document
# tokens = wpt.tokenize(doc)
# # filter stopwords out of document
# filtered_tokens = [token for token in tokens if token not in stop_words]
# # re-create document from filtered tokens
# doc = ' '.join(filtered_tokens

# In[66]:

# def rem(sentence,keep_apostrophes=False):
#     sentence = sentence.strip()
#     if keep_apostrophes:
#         PATTERN = r'[?|$|&|*|.|,|%|@|(|)|~]' # add other characters here to
#         #remove them
#         filtered_sentence = re.sub(PATTERN, r'', sentence)
#     else:
#         PATTERN = r'[^a-zA-Z0-9 ]' # only extract alpha-numeric characters
#         filtered_sentence = re.sub(PATTERN, r'', sentence)
#     return filtered_sentence

# In[98]:

# Ad-hoc check of remove_punctuation() on a string mixing ASCII and
# typographic punctuation. The result is only displayed when this runs as a
# notebook cell; it is not stored.
ss= 'where.... atre; you "coming" from? youre t”her’e oo."...,,[[]]/'
remove_punctuation(ss)

# In[69]:

# Accuracy of every model on the held-out set, as a percentage.
print("Random forest model accuracy",accuracy_score(label_test,rfc_pred)*100)
print("------------------------------------------------------------------")
print("Gradient Boosting model accuracy",accuracy_score(label_test,gbc_pred)*100)
print("------------------------------------------------------------------")
# The model is a multilayer perceptron (MLPClassifier).
print("Multilayer perceptron model accuracy",accuracy_score(label_test,mlc_pred)*100)
print("------------------------------------------------------------------")
print("Support vector machine model accuracy",accuracy_score(label_test,svc_pred)*100)
print("------------------------------------------------------------------")
print("Naive Bayes model accuracy",accuracy_score(label_test,mnb_pred)*100)
print("------------------------------------------------------------------")
print("Decision Tree model accuracy",accuracy_score(label_test,dtc_pred)*100)
deleted-user-5656355 | 14 posts | March 29, 2020, 4:19 p.m. | permalink