WeChat official account: 尤而小屋
Author: Peter
Editor: Peter
Hi everyone, I'm Peter~
This article builds and analyzes a model on a real/fake news dataset from Kaggle. Main contents:
- Parsing and cleaning the text data
- Plotting word clouds
- Word-frequency and text-length analysis
- Corpus generation and tokenization
- Modeling with Keras
Original dataset:
www.kaggle.com/code/madz20…
Import libraries
In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk import pos_tag
from nltk.corpus import wordnet
from bs4 import BeautifulSoup
import re,string,unicodedata
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from string import punctuation
import keras
from keras.models import Sequential
from keras.layers import Dense,Embedding,LSTM,Dropout
from keras.preprocessing import text, sequence
from keras.callbacks import ReduceLROnPlateau
import tensorflow as tf
import warnings
warnings.filterwarnings("ignore")
In [2]:
List all files under the current directory:
# list the files under the current directory
import os

for dirname, _, filenames in os.walk('./'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
./glove.twitter.27B.100d.txt
./.DS_Store
./Fake.csv
./glove.twitter.27B.50d.txt
./Fake.csv.zip
./glove.twitter.27B.200d.txt
./True.csv.zip
./True.csv
./kaggle实战-根据NLP的真假新闻识别.ipynb
./.ipynb_checkpoints/kaggle实战-根据NLP的真假新闻识别-checkpoint.ipynb
In [3]:
true = pd.read_csv("True.csv")
fake = pd.read_csv("Fake.csv")
Basic information about the data
In [5]:
true.shape, fake.shape
Out[5]:
((21417, 4), (23481, 4))
In [6]:
true.columns
Out[6]:
Index(['title', 'text', 'subject', 'date'], dtype='object')
In [7]:
true.dtypes  # column data types
Out[7]:
title object
text object
subject object
date object
dtype: object
In [8]:
true.isnull().sum()  # check missing values per column
Out[8]:
title 0
text 0
subject 0
date 0
dtype: int64
In [9]:
true.head()
Out[9]:
In [10]:
true["category"] = 1 # 类别标签
fake["category"] = 0
Concatenate the two datasets:
In [11]:
df = pd.concat([true, fake])
df.head()
Out[11]:
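As an aside, pd.concat keeps the original row indices of both frames, so df ends up with duplicate index labels. If that matters for later lookups, an optional variant (not used in this notebook) is:

# optional: avoid duplicate index labels after concatenation
df = pd.concat([true, fake], ignore_index=True)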
Check how many true and fake news items there are:
In [12]:
sns.set_style("darkgrid")
sns.countplot(df.category)
plt.show()
Data exploration
1. Total number of news titles
In [13]:
df.title.count()
Out[13]:
44898
2. Number of news items per subject
In [14]:
df["subject"].value_counts()
Out[14]:
politicsNews 11272
worldnews 10145
News 9050
politics 6841
left-news 4459
Government News 1570
US_News 783
Middle-east 778
Name: subject, dtype: int64
In [15]:
sns.countplot(data=df,
y="subject")
plt.show()
Counts per subject, split by true vs. fake:
In [16]:
fig = sns.countplot(data=df,
x="subject",
hue="category")
fig.set_xticklabels(fig.get_xticklabels(),rotation=90)
plt.show()
Data preprocessing
In [18]:
import nltk
nltk.download('stopwords')
Out[18]:
True
In [19]:
# build the stopword set (English stopwords + punctuation)
stop = set(stopwords.words('english'))
punctuation = list(string.punctuation)
stop.update(punctuation)
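A quick optional check (the exact count depends on the NLTK version) confirms that the set now mixes stopwords and punctuation marks:

# optional sanity check, not part of the original cell
print(len(stop))                    # roughly 180 English stopwords + 32 punctuation characters
print('the' in stop, ',' in stop)   # True True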
In [20]:
# strip HTML tags
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

# remove text between square brackets
def remove_between_square_brackets(text):
    return re.sub(r'\[[^]]*\]', '', text)

# remove URLs
def remove_url(text):
    return re.sub(r'http\S+', '', text)

# remove stopwords
def remove_stopwords(text):
    final_text = []
    for i in text.split():
        if i.strip().lower() not in stop:
            final_text.append(i.strip())
    return " ".join(final_text)

# remove noise: apply the cleaning steps above in order
def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    text = remove_url(text)
    text = remove_stopwords(text)
    return text
In [21]:
df['text']=df['text'].apply(denoise_text)
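As a minimal illustration (the sentence below is made up, not from the dataset), the cleaning pipeline strips HTML tags, bracketed fragments, URLs, stopwords and punctuation tokens:

# hypothetical example, not part of the original notebook
sample = "The <b>President</b> said [see note] that http://example.com is down"
print(denoise_text(sample))   # -> roughly "President said"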
Word clouds
In [22]:
# word cloud for the true news
plt.figure(figsize = (20,20))
wc = WordCloud(max_words = 2000 ,
width = 1600 ,
height = 800 ,
stopwords = STOPWORDS).generate(" ".join(df[df.category == 1].text))
plt.imshow(wc , interpolation = 'bilinear')
plt.show()
# word cloud for the fake news
plt.figure(figsize = (20,20))
wc = WordCloud(max_words = 2000 ,
width = 1600 ,
height = 800 ,
stopwords = STOPWORDS).generate(" ".join(df[df.category == 0].text))
plt.imshow(wc , interpolation = 'bilinear')
plt.show()
Comparing text length between true and fake news
In [24]:
fig,(ax1,ax2) = plt.subplots(1,2,figsize=(12,8))
text_len=df[df['category']==1]['text'].str.len()
ax1.hist(text_len,)
ax1.set_title('True text')
text_len=df[df['category']==0]['text'].str.len()
ax2.hist(text_len,)
ax2.set_title('Fake text')
fig.suptitle('Characters in texts')
plt.show()
Number of words per text
In [25]:
fig,(ax1,ax2)=plt.subplots(1,2,figsize=(12,8))
text_len=df[df['category']==1]['text'].str.split().map(lambda x: len(x))
ax1.hist(text_len, color='red')
ax1.set_title('True text')
text_len=df[df['category']==0]['text'].str.split().map(lambda x: len(x))
ax2.hist(text_len, color='green')
ax2.set_title('Fake text')
fig.suptitle('Words in texts')
plt.show()
Comparing the average word length per text
In [26]:
fig,(ax1,ax2)=plt.subplots(1,2,figsize=(20,10))
# True
word=df[df['category']==1]['text'].str.split().apply(lambda x : [len(i) for i in x])
sns.distplot(word.map(lambda x: np.mean(x)),
ax=ax1,
color='red')
ax1.set_title('True text')
# False
word=df[df['category']==0]['text'].str.split().apply(lambda x : [len(i) for i in x])
sns.distplot(word.map(lambda x: np.mean(x)),
ax=ax2,
color='green')
ax2.set_title('Fake text')
fig.suptitle('Average word length in each text')
Out[26]:
Text(0.5, 0.98, 'Average word length in each text')
Building the corpus
Splitting the texts
In [27]:
df.head()
Out[27]:
|   | text | category |
| --- | --- | --- |
| 0 | WASHINGTON (Reuters) head conservative Republi… | 1 |
| 1 | WASHINGTON (Reuters) Transgender people allowe… | 1 |
| 2 | WASHINGTON (Reuters) special counsel investiga… | 1 |
| 3 | WASHINGTON (Reuters) Trump campaign adviser Ge… | 1 |
| 4 | SEATTLE/WASHINGTON (Reuters) President Donald … | 1 |
In [28]:
def get_corpus(text):
    words = []
    for i in text:
        for j in i.split():          # split each text into tokens
            words.append(j.strip())  # strip surrounding whitespace
    return words

# call the function
corpus = get_corpus(df.text)
corpus[:5]
Out[28]:
['WASHINGTON', '(Reuters)', 'head', 'conservative', 'Republican']
Counting tokens in the corpus
In [29]:
from collections import Counter
# count occurrences of each token
counter = Counter(corpus)
# the 10 most common tokens
most_common = counter.most_common(10)
most_common = dict(most_common)
most_common
Out[29]:
{'Trump': 111503,
'said': 93162,
'would': 54613,
'U.S.': 50441,
'President': 33180,
'people': 33115,
'also': 30325,
'one': 29370,
'Donald': 27795,
'said.': 26194}
Extracting text features
In [30]:
from sklearn.feature_extraction.text import CountVectorizer
def get_top_text_ngrams(corpus, n, g):
    # CountVectorizer is a standard text feature extractor:
    # it counts how often each n-gram occurs in each document and
    # builds a term-frequency matrix via fit/transform.
    vec = CountVectorizer(ngram_range=(g, g)).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]
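A tiny made-up corpus (not from the dataset) shows what the helper returns: a list of (n-gram, count) pairs sorted by count, here for bigrams:

# hypothetical toy example, not part of the original notebook
toy = ["the cat sat on the mat", "the cat ate the fish"]
print(get_top_text_ngrams(toy, 3, 2))
# -> [('the cat', 2), ...] : 'the cat' appears twice, every other bigram once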
N-gram analysis
Unigram analysis
In [31]:
plt.figure(figsize = (16,9))
most_common_uni = get_top_text_ngrams(df.text,10,1)
most_common_uni = dict(most_common_uni)
sns.barplot(x=list(most_common_uni.values()),
y=list(most_common_uni.keys()))
plt.show()
Bigram analysis
In [32]:
plt.figure(figsize = (16,9))
most_common_bi = get_top_text_ngrams(df.text,10,2)
most_common_bi = dict(most_common_bi)
sns.barplot(x=list(most_common_bi.values()),
y=list(most_common_bi.keys()))
plt.show()
Trigram analysis
In [33]:
plt.figure(figsize = (16,9))
most_common_tri = get_top_text_ngrams(df.text,10,3)
most_common_tri = dict(most_common_tri)
sns.barplot(x=list(most_common_tri.values()),
y=list(most_common_tri.keys()))
plt.show()
Modeling
Splitting the dataset
In [34]:
x_train,x_test,y_train,y_test = train_test_split(df.text,df.category,random_state = 2023)
max_features = 10000
maxlen = 300
Tokenization + sequence padding
In [35]:
# tokenizer that keeps the max_features most frequent words
tokenizer = text.Tokenizer(num_words=max_features)
# fit the tokenizer on the training texts
tokenizer.fit_on_texts(x_train)
# convert each text to a sequence of word indices
tokenized_train = tokenizer.texts_to_sequences(x_train)
# pad the sequences so they all have the same length
x_train = sequence.pad_sequences(tokenized_train, maxlen=maxlen)
In [36]:
# tokenize and pad the test set
tokenized_test = tokenizer.texts_to_sequences(x_test)
X_test = sequence.pad_sequences(tokenized_test, maxlen=maxlen)
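To see what tokenization and padding actually produce, a small check could look like this (the exact word indices depend on the fitted vocabulary, so they are only illustrative):

# hypothetical illustration, not part of the original notebook
demo = tokenizer.texts_to_sequences(["trump said people would"])
print(demo)                                                # e.g. [[1, 2, 8, 5]] -- word indices
print(sequence.pad_sequences(demo, maxlen=maxlen).shape)   # (1, 300): zeros are padded on the left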
Word embeddings with GloVe
Official documentation: nlp.stanford.edu/projects/gl…
A good introduction on Zhihu: zhuanlan.zhihu.com/p/50946044
In [37]:
EMBEDDING_FILE = 'glove.twitter.27B.100d.txt'
In [38]:
# parse one line of the GloVe file into (word, vector)
def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')

embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in open(EMBEDDING_FILE))
In [39]:
all_embs = np.stack(embeddings_index.values())
emb_mean,emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
# initialize the matrix with random values drawn from the GloVe value distribution
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
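An optional shape check (assuming the tokenizer vocabulary is larger than max_features, which it is for a corpus of this size) confirms the matrix lines up with the Embedding layer defined below:

# optional sanity check, not part of the original notebook
print(embedding_matrix.shape)   # expected (10000, 100): one 100-d vector per kept word index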
Model training
In [40]:
# model hyperparameters
batch_size = 256
epochs = 10
embed_size = 100
In [41]:
# learning-rate schedule: halve the LR when val_accuracy stops improving
learning_rate_reduction = ReduceLROnPlateau(
monitor='val_accuracy',
patience = 2,
verbose=1,
factor=0.5,
min_lr=0.00001
)
In [42]:
# build the Keras model
model = Sequential()
model.add(Embedding(max_features,
output_dim=embed_size,
weights=[embedding_matrix],
input_length=maxlen,
trainable=False))
model.add(LSTM(units=128 ,
return_sequences = True ,
recurrent_dropout = 0.25 ,
dropout = 0.25))
model.add(LSTM(units=64,
recurrent_dropout = 0.1 ,
dropout = 0.1))
model.add(Dense(units = 32 , activation = 'relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer="rmsprop",
loss='binary_crossentropy',
metrics=['accuracy'])
In [43]:
model.summary()  # model summary
# train the model
history = model.fit(x_train,
y_train,
batch_size = batch_size ,
validation_data = (X_test,y_test) ,
epochs = epochs ,
callbacks = [learning_rate_reduction])
Model evaluation
In [45]:
print("Accuracy of the model on Training Data is - " , model.evaluate(x_train,y_train)[1]*100 , "%")
1053/1053 [==============================] - 135s 128ms/step - loss: 0.0065 - accuracy: 0.9981
Accuracy of the model on Training Data is - 99.81290698051453 %
In [46]:
print("Accuracy of the model on Testing Data is - " , model.evaluate(X_test,y_test)[1]*100 , "%")
351/351 [==============================] - 45s 129ms/step - loss: 0.0128 - accuracy: 0.9964
Accuracy of the model on Testing Data is - 99.64365363121033 %
Visualizing the results
In [47]:
# the 10 training epochs
epochs = [i for i in range(10)]
fig , ax = plt.subplots(1,2)
train_acc = history.history['accuracy']
train_loss = history.history['loss']
val_acc = history.history['val_accuracy']
val_loss = history.history['val_loss']
fig.set_size_inches(20,10)
ax[0].plot(epochs , train_acc , 'go-' , label = 'Training Accuracy')
ax[0].plot(epochs , val_acc , 'ro-' , label = 'Testing Accuracy')
ax[0].set_title('Training & Testing Accuracy')
ax[0].legend()
ax[0].set_xlabel("Epochs")
ax[0].set_ylabel("Accuracy")
ax[1].plot(epochs , train_loss , 'go-' , label = 'Training Loss')
ax[1].plot(epochs , val_loss , 'ro-' , label = 'Testing Loss')
ax[1].set_title('Training & Testing Loss')
ax[1].legend()
ax[1].set_xlabel("Epochs")
ax[1].set_ylabel("Loss")
plt.show()
Model predictions
In [48]:
len(X_test)
Out[48]:
11225
In [49]:
pred = model.predict(X_test)
pred[:5]
Out[49]:
array([[5.7114285e-05],
[1.3976330e-05],
[9.9959743e-01],
[4.7492540e-05],
[1.5471596e-05]], dtype=float32)
In [50]:
y_pred=np.argmax(pred,axis=1)
y_pred
Out[51]:
array([0, 0, 0, ..., 0, 0, 0])
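Note that np.argmax over a single sigmoid output column always returns 0, which is why y_pred above is all zeros and why the classification report and confusion matrix below collapse onto the 'Fake' class. A minimal sketch of the usual conversion for a one-unit sigmoid output is to threshold at 0.5; rerunning the report with this y_pred should agree with the ~99.6% test accuracy reported by model.evaluate:

# threshold the predicted probabilities instead of taking argmax over one column
y_pred = (pred > 0.5).astype(int).reshape(-1)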
In [52]:
# classification report
print(classification_report(y_test, y_pred, target_names = ['Fake','Not Fake']))
              precision    recall  f1-score   support

        Fake       0.53      1.00      0.70      5981
    Not Fake       0.00      0.00      0.00      5244

    accuracy                           0.53     11225
   macro avg       0.27      0.50      0.35     11225
weighted avg       0.28      0.53      0.37     11225
In [53]:
# confusion matrix
cm = confusion_matrix(y_test,y_pred)
cm
Out[53]:
array([[5981, 0],
[5244, 0]])
In [54]:
cm = pd.DataFrame(cm ,
index = ['Fake','Original'] ,
columns = ['Fake','Original'])
In [55]:
# visualize the confusion matrix as a heatmap
plt.figure(figsize = (10,10))
sns.heatmap(cm,
cmap= "Blues",
linecolor = 'black' ,
linewidth = 1 ,
annot = True,
fmt='' ,
xticklabels = ['Fake','Original'] ,
yticklabels = ['Fake','Original'])
plt.xlabel("Predicted")
plt.ylabel("Actual")