日韩性视频-久久久蜜桃-www中文字幕-在线中文字幕av-亚洲欧美一区二区三区四区-撸久久-香蕉视频一区-久久无码精品丰满人妻-国产高潮av-激情福利社-日韩av网址大全-国产精品久久999-日本五十路在线-性欧美在线-久久99精品波多结衣一区-男女午夜免费视频-黑人极品ⅴideos精品欧美棵-人人妻人人澡人人爽精品欧美一区-日韩一区在线看-欧美a级在线免费观看

歡迎訪問 生活随笔!

生活随笔

當前位置: 首頁 > 编程语言 > python > 内容正文

python

python倒排索引搜索引擎_【Python】倒排索引

發(fā)布時(shí)間:2025/3/15 python 23 豆豆
生活随笔 收集整理的這篇文章主要介紹了 python倒排索引搜索引擎_【Python】倒排索引 小編覺得挺不錯的,現在分享給大家,幫大家做個參考。

代碼鏈接

預(yù)處理

word stemming

一個(gè)單詞可能不同的形式,在英語中比如動(dòng)詞的主被動(dòng)、單復(fù)數(shù)等。比如live\lives\lived.

雖然英文的處理看起來已經很復雜,但實際上中文的處理要更加復雜得多。

stop words

比如a、the這種詞在處理的時(shí)候沒有實(shí)際意義。在這里處理的時(shí)候先對(duì)詞頻進(jìn)行統(tǒng)計(jì),人為界定停詞,簡(jiǎn)單的全部替換為空格。但是這種方式并不適用于所有的情況,對(duì)于比如,To be or not to be,這種就很難處理。

具體實(shí)現(xiàn)

Index.txt 記錄所出現(xiàn)的文件

這里將建立倒排索引分為三步

thefile.txt 所有出現(xiàn)過的詞(詞頻由高到低)

stop_word.txt 停詞

data.pkl 所創(chuàng)建的索引

1 count.py 確定停詞

2 index.py 建立倒排索引

3 query.py 用于查詢

這里在建立倒排索引的時(shí)候只記錄了出現(xiàn)的文件名,并沒有記錄在文件中出現(xiàn)的位置。

圖為count.py生成的詞頻統(tǒng)計(jì)

count.py

#-*- coding:utf-8 -*-

'''

@author birdy qian

'''

import sys

from nltk import * #import natural-language-toolkit

from operator import itemgetter #for sort

def output_count(fdist):
    """Report word frequencies from *fdist*.

    Prints the 250 most frequent (word, count) pairs, draws the frequency
    plot, and writes every word — most frequent first — to thefile.txt.

    fdist: an nltk FreqDist (any object with .items() and .plot() works).
    """
    # Sort (word, count) pairs by count, highest first.
    vocabulary = sorted(fdist.items(), key=itemgetter(1), reverse=True)

    print(vocabulary[:250])        # top 250 words and their counts
    print('drawing plot.....')     # show progress; plotting can be slow
    fdist.plot(120, cumulative=False)

    # Persist the full vocabulary as one space-separated run of words.
    # 'with' guarantees the file is closed even if a write fails.
    with open('thefile.txt', 'w') as out:
        for word, _count in vocabulary:
            out.write(word + ' ')

def pre_file(filename):
    """Read <filename>.txt and return its Porter-stemmed tokens.

    The text is lowercased and punctuation is replaced by spaces before
    splitting, so split() separates words cleanly.
    """
    print("read file %s.txt....." % filename)   # show progress
    # 'with' closes the handle; the original leaked an open file object.
    with open(str(filename) + '.txt', 'r') as f:
        content = f.read().lower()

    # One C-level pass replacing every punctuation char with a space
    # (replaces the original chain of .replace() calls).
    punctuation = '!"#$%&()*+,-./:;<=>?@[\\]^_‘{|}~'
    content = content.translate(str.maketrans(punctuation, ' ' * len(punctuation)))

    tokens = content.split()        # split at whitespace / newlines
    stemmer = PorterStemmer()       # prepare for stemming
    return [stemmer.stem(tok) for tok in tokens]

#main function
def main():
    """Count word frequencies over every file listed in index.txt and
    report them via output_count()."""
    print("read index.....")            # show progress
    with open('index.txt', 'r') as f:   # titles that need to be handled
        filenames = f.read().split()

    fdist = FreqDist()                  # frequency distribution over all files
    for name in filenames:
        # NOTE: the old Python 2 utf-8 -> filesystem-encoding re-encode
        # step is gone; Python 3 strings are already unicode.
        for word in pre_file(name):     # pre-process the file into tokens
            fdist[word] += 1
    output_count(fdist)


# run as a script
if __name__ == '__main__':
    main()

index.py

#-*- coding:utf-8 -*-

'''

@author birdy qian

'''

import sys

import pickle

from nltk import * #import natural-language-toolkit

from operator import itemgetter #for sort

STOPWORDS = []  # global stop-word list, filled in by main()


def output_index(result):
    """Serialize the inverted index *result* (dict: word -> list of file
    names) to data.pkl.

    Uses pickle's default protocol; the old comment claiming "protocol 0"
    was wrong.  'with' guarantees the file is closed on every path.
    """
    with open('data.pkl', 'wb') as out:
        pickle.dump(result, out)

def pre_file(filename):
    """Read <filename>.txt and return its Porter-stemmed tokens with
    punctuation and stop words removed.

    Relies on the module-level STOPWORDS list (loaded by main()).
    """
    global STOPWORDS
    print("read file %s.txt....." % filename)   # show progress
    with open(str(filename) + '.txt', 'r') as f:
        content = f.read().lower()

    # Replace punctuation with spaces so split() separates the words.
    # (The mojibake '��' in the original literal is restored to the
    # '‘' character used by the count.py version of this function.)
    for ch in '!"#$%&()*+,-./:;<=>?@[\\]^_‘{|}~':
        content = content.replace(ch, " ")

    # BUG FIX: the original removed stop words with content.replace(),
    # which also deleted them *inside* longer words (stop word "a"
    # would mangle "cat" into "c t").  Filter whole tokens instead.
    stop = set(STOPWORDS)
    tokens = [tok for tok in content.split() if tok not in stop]

    stemmer = PorterStemmer()                   # prepare for stemming
    return [stemmer.stem(tok) for tok in tokens]

def readfile(filename):
    """Return the whitespace-separated tokens of *filename*.

    'with' closes the file automatically; the original also shadowed the
    builtin input() with its file variable.
    """
    with open(filename, 'r') as f:
        return f.read().split()   # split at '\n' or ' '

#main function
def main():
    """Build the inverted index: map every stemmed word to the list of
    files it occurs in, then pickle the mapping to data.pkl."""
    global STOPWORDS

    print("read index.....")                    # show progress
    titles = readfile('index.txt')              # files to be indexed
    print("read stopwords.....")
    STOPWORDS = readfile('stop_word.txt')
    # NOTE: the original also loaded thefile.txt into a 'word' list that
    # was never used afterwards — dropped.

    result = {}                                 # word -> posting list of file names
    for title in titles:
        # set() de-duplicates repeated words within one file, so each
        # file appears at most once per posting list.
        for word in set(pre_file(title)):
            # setdefault replaces the old get()/== None/append dance
            result.setdefault(word, []).append(title)

    output_index(result)


# run as a script
if __name__ == '__main__':
    main()

query.py

#-*- coding:utf-8 -*-

'''

@author birdy qian

'''

import os

import sys

import pprint, pickle

from nltk import PorterStemmer

def readfile(filename):
    """Return the whitespace-separated tokens of *filename*.

    'with' closes the file automatically; the original also shadowed the
    builtin input() with its file variable.
    """
    with open(filename, 'r') as f:
        return f.read().split()   # split at '\n' or ' '

def getdata():
    """Load and return the pickled inverted index from data.pkl."""
    # 'with' closes the file on every path; the original relied on a
    # manual close that a pickle error would have skipped.
    with open('data.pkl', 'rb') as pkl_file:
        return pickle.load(pkl_file)

def output(result):
    """Pretty-print a query result (list of file names), ten at a time.

    None (single word missing from the index) or an empty list (no
    overlap between words) prints None.  Fewer than ten hits are printed
    whole; longer results are paged interactively, 10 per screen.
    """
    if not result:          # covers both None and []
        print(None)
        return

    total = len(result)
    if total < 10:
        print(result)
        return

    print('get ' + str(total) + ' records')     # the record count
    # ceil(total / 10) pages; the original's len/10 + 1 emitted an
    # empty extra page when total was a multiple of 10, and '/' is
    # float division on Python 3 — '//' fixes both.
    for page in range((total + 9) // 10):
        start = page * 10
        print('10 records start from ' + str(start + 1))
        print(result[start:start + 10])
        if start + 10 >= total:                 # last page — stop paging
            break
        answer = input("Enter 'N' for next ten records & other input to quit : ")
        if answer != 'N':
            break

#main function
def _parse_query(line, stopwords, stemmer):
    """Lowercase *line*, drop stop words (as whole tokens, not substrings),
    and return the Porter-stemmed query terms."""
    tokens = [tok for tok in line.lower().split() if tok not in stopwords]
    return [stemmer.stem(tok) for tok in tokens]


def main():
    """Interactive query loop: read words, intersect their posting lists
    from the pickled index, and page the matching file names."""
    data_list = getdata()                       # the inverted index
    stopwords = set(readfile('stop_word.txt'))  # set: O(1) membership test
    stemmer = PorterStemmer()                   # prepare for stemming

    while True:
        line = input("Enter your query('\\'to quit): ")
        if line == '\\':                        # leave the loop
            break
        query_list = _parse_query(line, stopwords, stemmer)
        # keep asking until the query contains at least one usable word
        while not query_list:
            line = input("Please enter more information: ")
            query_list = _parse_query(line, stopwords, stemmer)

        # Intersect the posting lists of every query word.
        # BUG FIX: .get(word, []) — the original's bare .get(word)
        # returned None for an unindexed word and crashed the
        # set(...).intersection(None) call below.
        result = []
        for k, word in enumerate(query_list):
            postings = data_list.get(word, [])
            if k == 0:                          # first word seeds the result
                result = postings
            else:                               # later words narrow it
                result = list(set(result).intersection(postings))
        output(result)


# run as a script
if __name__ == '__main__':
    main()

總結(jié)

以上是生活随笔為你收集整理的python倒排索引搜索引擎_【Python】倒排索引的全部內容,希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網(wǎng)站內(nèi)容還不錯(cuò),歡迎將生活随笔推薦給好友。