當前位置：首頁 > 编程语言 > python > 内容正文

python

基于python的nlp预备知识

發布時間：2025/3/21 python 23 豆豆

生活随笔收集整理的這篇文章主要介紹了基于python的nlp预备知识，小編覺得挺不錯的，現在分享給大家，幫大家做個參考。

基于python的nlp預備知識

載入語料庫
- brown 語料庫的導(dǎo)入
分詞
- nltk的word_tokenize
Stem抽取題干和Lemma 詞形還原
- NLTK實現Stemming三種方式
- NLTK實現Lemma 詞形還原
停止詞
關鍵詞打分
情感分析
文本相似度
- 用Frequency 頻率統計計算文本相似度
TF-IDF

載入語料庫

import nltk

# Fetch the NLTK resources used by the snippets in this article.
nltk.download('stopwords')  # stop-word lists for stopwords.words('english')
nltk.download('punkt')      # tokenizer models used by nltk.word_tokenize
nltk.download('brown')      # the Brown corpus
nltk.download('wordnet')    # WordNet data required by WordNetLemmatizer

brown 語料庫的導(dǎo)入

# nltk.corpus exposes ready-made corpora; brown is the corpus compiled at
# Brown University, with its texts organised into categories.
from nltk.corpus import brown

brown.categories()   # names of the categories
len(brown.sents())   # number of sentences
len(brown.words())   # number of words

分詞

nltk的word_tokenize

import nltk

sentence = 'hello, world'
tokens = nltk.word_tokenize(sentence)  # tokenize with NLTK's word_tokenize
tokens

['hello', ',', 'world']

Stem抽取題干和Lemma 詞形還原

NLTK實現Stemming三種方式

# Judging by the outputs, the Lancaster stemmer is the strictest of the three:
# it is fast, but it cuts away a large part of each word, which can leave
# stems that are blurry and hard to understand.

print('第1種方式'+'*'*100)
# 1: Porter stemmer
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()
porter_stemmer.stem('maximum')     # 'maximum'
porter_stemmer.stem('presumably')  # 'presum'
porter_stemmer.stem('multiply')    # 'multipli'
porter_stemmer.stem('working')     # 'work'

print('第2種方式'+'*'*100)
# 2: Lancaster stemmer
from nltk.stem.lancaster import LancasterStemmer
lancaster_stemmer = LancasterStemmer()
lancaster_stemmer.stem('maximum')     # 'maxim'
lancaster_stemmer.stem('presumably')  # 'presum'
lancaster_stemmer.stem('multiply')    # 'multiply'
# Use the stemmer of this section (the original called porter_stemmer here).
lancaster_stemmer.stem('working')     # 'work'

print('第3種方式'+'*'*100)
# 3: Snowball stemmer
from nltk.stem import SnowballStemmer
snowball_stemmer = SnowballStemmer('english')
snowball_stemmer.stem('maximum')     # 'maximum'
snowball_stemmer.stem('presumably')  # 'presum'
snowball_stemmer.stem('multiply')    # 'multipli'
# Use the stemmer of this section (the original called porter_stemmer here).
snowball_stemmer.stem('working')     # 'work'

NLTK實現Lemma 詞形還原

# Lemmatization with NLTK
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()
wordnet_lemmatizer.lemmatize('dogs')        # 'dog'
wordnet_lemmatizer.lemmatize('churches')    # 'church'
wordnet_lemmatizer.lemmatize('aardwolves')  # 'aardwolf'
wordnet_lemmatizer.lemmatize('abaci')       # 'abacus'
# 'working' -> 'work' is a stemming job, so lemmatizing has no effect here.
wordnet_lemmatizer.lemmatize('working')     # 'working'
wordnet_lemmatizer.lemmatize('are')         # 'are'
# Passing the part of speech (verb) lets the lemmatizer find the base form.
wordnet_lemmatizer.lemmatize('are', pos='v')  # 'be'

停止詞

import nltk
from nltk.corpus import stopwords

sentence = 'food is my family'
word_list = nltk.word_tokenize(sentence)  # tokenize

# Build the stop-word set once: calling stopwords.words('english') inside the
# comprehension would rebuild the whole list for every token, and a set makes
# each membership test O(1).
english_stopwords = set(stopwords.words('english'))
filtered_words = [word for word in word_list if word not in english_stopwords]
filtered_words

['food', 'is', 'my', 'family']
['food', 'family']
停止詞網站

關鍵詞打分

dict.get(key, default=None)
key – 字典中要查找的鍵。
default – 如果指定鍵的值不存在時，返回該默認值。
返回指定鍵的值，如果鍵不在字典中則返回默認值（默認為 None）。

import nltk

# Sentiment scoring from a word -> score lexicon
sentiment_dictionary = {}  # e.g. {'abandon': -2, 'abandoned': -2, 'abandons': -2, ...}

# Read the lexicon line by line; the context manager closes the file even if
# a malformed line raises.
with open("data/AFINN-111.txt") as lexicon:
    for line in lexicon:  # a line looks like "abandon\t-2"
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing newline at EOF)
        word, score = line.split('\t')  # word and score are tab-separated
        sentiment_dictionary[word] = int(score)  # int() ignores the trailing newline

# With the score table stored in a dict, walk through the sentence and add up
# the score of every word.
sentence = 'like love'
words = nltk.word_tokenize(sentence)
# dict.get returns the stored score when the word is known and 0 otherwise.
total_score = sum(sentiment_dictionary.get(word, 0) for word in words)
total_score

5
AFINN-111

情感分析

# Sentiment analysis
from nltk.classify import NaiveBayesClassifier  # naive Bayes

# A tiny hand-made training set
s1 = 'this is a good book'
s2 = 'this is a awesome book'
s3 = 'this is a bad book'
s4 = 'this is a terrible book'


def preprocess(s):
    """Turn a sentence into a bag-of-words feature dict.

    The sentence is lower-cased and split on whitespace; every resulting word
    becomes a key mapped to True, e.g.
    {'this': True, 'is': True, 'a': True, 'good': True, 'book': True}.
    This could later be replaced by a richer representation such as word2vec.
    """
    return {word: True for word in s.lower().split()}


# Put the training set into the [features, label] form the classifier takes
training_data = [[preprocess(s1), 'pos'],
                 [preprocess(s2), 'pos'],
                 [preprocess(s3), 'neg'],
                 [preprocess(s4), 'neg']]

# Feed it to the model
model = NaiveBayesClassifier.train(training_data)

# Print the results
print(training_data)
print(model.classify(preprocess('this is a bad book')))  # neg

[[{'this': True, 'is': True, 'a': True, 'good': True, 'book': True}, 'pos'], [{'this': True, 'is': True, 'a': True, 'awesome': True, 'book': True}, 'pos'], [{'this': True, 'is': True, 'a': True, 'bad': True, 'book': True}, 'neg'], [{'this': True, 'is': True, 'a': True, 'terrible': True, 'book': True}, 'neg']]

文本相似度

用Frequency 頻率統計計算文本相似度

"""Text similarity from term frequencies.

What it does: represents a text by the frequencies of its tokens and compares
texts through those frequency vectors.
Drawback: counting frequencies discards all word-position information.
"""
import nltk
from nltk import FreqDist
import numpy as np
import pandas as pd

########### Build the vocabulary and the frequency of every word in it ###########
# Start with a small corpus
corpus = 'this is my sentence ' \
         'this is my life ' \
         'this is the day'
# corpus == 'this is my sentence this is my life this is the day'

# Plain tokenization; any preprocessing (stop words, lemmatization, stemming,
# etc.) could be applied here as needed.
tokens = nltk.word_tokenize(corpus)

# Count how often each token occurs with NLTK's FreqDist
fdist = FreqDist(tokens)  # fdist behaves like a dict
# FreqDist({'this': 3, 'is': 3, 'my': 2, 'sentence': 1, 'life': 1, 'the': 1, 'day': 1})

# Indexing by a word gives its number of occurrences in the whole text:
# print(fdist['is'])  # 3

# Take the 50 most common words: a list of (word, frequency) tuples
standard_freq_vector = fdist.most_common(50)
# [('this', 3), ('is', 3), ('my', 2), ('sentence', 1), ('life', 1), ('the', 1), ('day', 1)]
size = len(standard_freq_vector)  # 7: the vocabulary holds 7 words


def position_lookup(v):
    """Map every word of a frequency list to its position.

    :param v: list of (word, frequency) tuples, e.g.
        [('this', 3), ('is', 3), ('my', 2), ('sentence', 1), ('life', 1), ('the', 1), ('day', 1)]
    :return: (loc, fre) where loc maps each word in v to its index in v and
        fre lists the frequencies in the same order
    """
    loc = {}
    fre = []
    for position, entry in enumerate(v):  # entry is e.g. ('this', 3)
        loc[entry[0]] = position
        fre.append(entry[1])
    return loc, fre


# Record the position of every vocabulary word
loc, fre = position_lookup(standard_freq_vector)
# loc: {'this': 0, 'is': 1, 'my': 2, 'sentence': 3, 'life': 4, 'the': 5, 'day': 6}
# fre: [3, 3, 2, 1, 1, 1, 1]

# Show the vocabulary words and their frequencies as a DataFrame
standard_vector = list(loc)  # dict keys keep insertion (position) order
df = pd.DataFrame({'詞庫': np.array(standard_vector), '詞庫頻次': fre})
print(df)

########### Three sentences: count their words against the vocabulary ###########
# Suppose we get some new sentences:
sentence1 = 'this is my life '
sentence2 = 'this is my sentence '
sentence3 = 'life my is this'
sentence = [sentence1, sentence2, sentence3]


def vec(sen_tok, loc):
    """Count the tokens of a sentence at their vocabulary positions.

    :param sen_tok: list of tokens of one sentence
    :param loc: dict mapping each vocabulary word to its position
    :return: list with one counter per entry of loc; tokens missing from loc
        are ignored
    """
    # Size the vector from the lookup that was passed in rather than from the
    # module-level `size`, so the function does not rely on a global.
    freq_vector = [0] * len(loc)
    for word in sen_tok:
        position = loc.get(word)
        if position is not None:
            # Known word: add 1 at its "standard position".
            freq_vector[position] += 1
        # Unknown words are simply skipped.
    return freq_vector


def cosine_similarity(u, v):
    """Return the cosine of the angle between the frequency vectors u and v.

    The larger the cosine, the smaller the angle and the more similar the two
    vectors. The denominator is the product of the 2-norms, computed with
    np.linalg.norm(..., ord=2). Returns 0.0 when either vector is all zeros,
    instead of dividing by zero.
    """
    u = np.asarray(u, dtype=float)
    v = np.asarray(v, dtype=float)
    denominator = np.linalg.norm(u, ord=2) * np.linalg.norm(v, ord=2)
    if denominator == 0:
        return 0.0
    return float(np.dot(u, v) / denominator)


tokens = [nltk.word_tokenize(i) for i in sentence]  # tokenize the three sentences
# [['this', 'is', 'my', 'life'], ['this', 'is', 'my', 'sentence'], ['life', 'my', 'is', 'this']]

# Count each sentence's words against the vocabulary. New words are skipped,
# so the vocabulary needs to be comprehensive.
sent_fre = [vec(i, loc) for i in tokens]
# [[1, 1, 1, 0, 1, 0, 0], [1, 1, 1, 1, 0, 0, 0], [1, 1, 1, 0, 1, 0, 0]]

# Add the counts to the DataFrame
df['sen1_頻次'] = sent_fre[0]
df['sen2_頻次'] = sent_fre[1]
df['sen3_頻次'] = sent_fre[2]
print(df)

########### Cosine similarity of sen1 vs sen2 and sen1 vs sen3 from the counts ###########
sen1_sen2_simi = cosine_similarity(df['sen1_頻次'], df['sen2_頻次'])
sen1_sen3_simi = cosine_similarity(df['sen1_頻次'], df['sen3_頻次'])

print('sen1與sen2的相似度', sen1_sen2_simi)
print('sen1與sen3的相似度', sen1_sen3_simi)
# sen3 is sen1 with the word order reversed, so the two read as unrelated
# sentences, yet their similarity reaches the maximum: only frequencies are
# compared, never positions.

TF-IDF

# TF-IDF with NLTK
# Number of documents: 3
import nltk
from nltk.text import TextCollection

# The three documents
raw_sents = ['this is sentence one', 'this is sentence two', 'this is sentence three']
# Tokenize (kept under the name `sents`, as the collection input)
sents = [nltk.word_tokenize(sent) for sent in raw_sents]
# Put them into a TextCollection
corpus = TextCollection(sents)

# Compute idf and check it against the formula
corpus.idf('this')   # np.log(3/3) = log(3 documents in total / 3 documents containing 'this') = 0
corpus.idf('three')  # np.log(3/1) = 1.0986122886681098

# Compute tf and tf-idf
corpus.tf('three', nltk.word_tokenize('one two three, go'))      # 1/5
corpus.tf_idf('three', nltk.word_tokenize('one two three, go'))  # 1/5 * 1.0986122886681098 = 0.21972245773362198

# For a new sentence
new_sentence = 'is three, go'
# Tokenize once instead of re-tokenizing on every loop iteration.
new_tokens = nltk.word_tokenize(new_sentence)

# Walk through every token of the new sentence
for word in new_tokens:
    print(word, ':', 'TF-IDF', corpus.tf_idf(word, new_tokens))
# 'is' occurs in all three documents, so its importance in the new sentence is 0

總結

以上是生活随笔為你收集整理的基于python的nlp预备知识的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯，歡迎將生活随笔推薦給好友。

上一篇： TensorFlow基于cifar10数
下一篇：基于python的打印进度条、计算用时