日韩性视频-久久久蜜桃-www中文字幕-在线中文字幕av-亚洲欧美一区二区三区四区-撸久久-香蕉视频一区-久久无码精品丰满人妻-国产高潮av-激情福利社-日韩av网址大全-国产精品久久999-日本五十路在线-性欧美在线-久久99精品波多结衣一区-男女午夜免费视频-黑人极品ⅴideos精品欧美棵-人人妻人人澡人人爽精品欧美一区-日韩一区在线看-欧美a级在线免费观看

歡迎訪問 生活随笔!

生活随笔

當前位置: 首頁 > 编程语言 > python > 内容正文

python

python倒排索引搜索引擎_【Python】倒排索引

發(fā)布時(shí)間:2025/3/15 python 23 豆豆
生活随笔 收集整理的這篇文章主要介紹了 python倒排索引搜索引擎_【Python】倒排索引 小編覺得挺不錯的,現在分享給大家,幫大家做個參考。

代碼鏈接

預(yù)處理

word stemming

一個(gè)單詞可能不同的形式,在英語中比如動(dòng)詞的主被動(dòng)、單復(fù)數(shù)等。比如live\lives\lived.

雖然英文的處理看起來已經很復雜,但實際上中文的處理要更加復雜得多。

stop words

比如a、the這種詞在處理的時(shí)候沒有實(shí)際意義。在這里處理的時(shí)候先對(duì)詞頻進(jìn)行統(tǒng)計(jì),人為界定停詞,簡(jiǎn)單的全部替換為空格。但是這種方式并不適用于所有的情況,對(duì)于比如,To be or not to be,這種就很難處理。

具體實(shí)現(xiàn)

Index.txt 記錄所出現(xiàn)的文件

這里將建立倒排索引分為三步

thefile.txt 所有出現(xiàn)過的詞(詞頻由高到低)

stop_word.txt 停詞

data.pkl 所創(chuàng)建的索引

1 count.py 確定停詞

2 index.py 建立倒排索引

3 query.py 用于查詢

這里在建立倒排索引的時(shí)候只記錄了出現(xiàn)的文件名,并沒有記錄在文件中出現(xiàn)的位置。

圖為count.py生成的詞頻統(tǒng)計(jì)

count.py

#-*- coding:utf-8 -*-

'''

@author birdy qian

'''

import sys

from nltk import * #import natural-language-toolkit

from operator import itemgetter #for sort

def output_count(fdist):
    """Report word frequencies from *fdist*.

    Prints the 250 most frequent (word, count) pairs, draws the frequency
    plot, and writes every word — most frequent first — to thefile.txt.

    fdist: an nltk FreqDist (any object with .items() and .plot() works).
    """
    # Sort (word, count) pairs by count, highest first.
    vocabulary = sorted(fdist.items(), key=itemgetter(1), reverse=True)

    print(vocabulary[:250])        # top 250 words and their counts
    print('drawing plot.....')     # show progress; plotting can be slow
    fdist.plot(120, cumulative=False)

    # Persist the full vocabulary as one space-separated run of words.
    # 'with' guarantees the file is closed even if a write fails.
    with open('thefile.txt', 'w') as out:
        for word, _count in vocabulary:
            out.write(word + ' ')

def pre_file(filename):
    """Read <filename>.txt and return its Porter-stemmed tokens.

    The text is lowercased and punctuation is replaced by spaces before
    splitting, so split() separates words cleanly.
    """
    print("read file %s.txt....." % filename)   # show progress
    # 'with' closes the handle; the original leaked an open file object.
    with open(str(filename) + '.txt', 'r') as f:
        content = f.read().lower()

    # One C-level pass replacing every punctuation char with a space
    # (replaces the original chain of .replace() calls).
    punctuation = '!"#$%&()*+,-./:;<=>?@[\\]^_‘{|}~'
    content = content.translate(str.maketrans(punctuation, ' ' * len(punctuation)))

    tokens = content.split()        # split at whitespace / newlines
    stemmer = PorterStemmer()       # prepare for stemming
    return [stemmer.stem(tok) for tok in tokens]

#main function
def main():
    """Count word frequencies over every file listed in index.txt and
    report them via output_count()."""
    print("read index.....")            # show progress
    with open('index.txt', 'r') as f:   # titles that need to be handled
        filenames = f.read().split()

    fdist = FreqDist()                  # frequency distribution over all files
    for name in filenames:
        # NOTE: the old Python 2 utf-8 -> filesystem-encoding re-encode
        # step is gone; Python 3 strings are already unicode.
        for word in pre_file(name):     # pre-process the file into tokens
            fdist[word] += 1
    output_count(fdist)


# run as a script
if __name__ == '__main__':
    main()

index.py

#-*- coding:utf-8 -*-

'''

@author birdy qian

'''

import sys

import pickle

from nltk import * #import natural-language-toolkit

from operator import itemgetter #for sort

STOPWORDS = []  # global stop-word list, filled in by main()


def output_index(result):
    """Serialize the inverted index *result* (dict: word -> list of file
    names) to data.pkl.

    Uses pickle's default protocol; the old comment claiming "protocol 0"
    was wrong.  'with' guarantees the file is closed on every path.
    """
    with open('data.pkl', 'wb') as out:
        pickle.dump(result, out)

def pre_file(filename):
    """Read <filename>.txt and return its Porter-stemmed tokens with
    punctuation and stop words removed.

    Relies on the module-level STOPWORDS list (loaded by main()).
    """
    global STOPWORDS
    print("read file %s.txt....." % filename)   # show progress
    with open(str(filename) + '.txt', 'r') as f:
        content = f.read().lower()

    # Replace punctuation with spaces so split() separates the words.
    # (The mojibake '��' in the original literal is restored to the
    # '‘' character used by the count.py version of this function.)
    for ch in '!"#$%&()*+,-./:;<=>?@[\\]^_‘{|}~':
        content = content.replace(ch, " ")

    # BUG FIX: the original removed stop words with content.replace(),
    # which also deleted them *inside* longer words (stop word "a"
    # would mangle "cat" into "c t").  Filter whole tokens instead.
    stop = set(STOPWORDS)
    tokens = [tok for tok in content.split() if tok not in stop]

    stemmer = PorterStemmer()                   # prepare for stemming
    return [stemmer.stem(tok) for tok in tokens]

def readfile(filename):
    """Return the whitespace-separated tokens of *filename*.

    'with' closes the file automatically; the original also shadowed the
    builtin input() with its file variable.
    """
    with open(filename, 'r') as f:
        return f.read().split()   # split at '\n' or ' '

#main function
def main():
    """Build the inverted index: map every stemmed word to the list of
    files it occurs in, then pickle the mapping to data.pkl."""
    global STOPWORDS

    print("read index.....")                    # show progress
    titles = readfile('index.txt')              # files to be indexed
    print("read stopwords.....")
    STOPWORDS = readfile('stop_word.txt')
    # NOTE: the original also loaded thefile.txt into a 'word' list that
    # was never used afterwards — dropped.

    result = {}                                 # word -> posting list of file names
    for title in titles:
        # set() de-duplicates repeated words within one file, so each
        # file appears at most once per posting list.
        for word in set(pre_file(title)):
            # setdefault replaces the old get()/== None/append dance
            result.setdefault(word, []).append(title)

    output_index(result)


# run as a script
if __name__ == '__main__':
    main()

query.py

#-*- coding:utf-8 -*-

'''

@author birdy qian

'''

import os

import sys

import pprint, pickle

from nltk import PorterStemmer

def readfile(filename):
    """Return the whitespace-separated tokens of *filename*.

    'with' closes the file automatically; the original also shadowed the
    builtin input() with its file variable.
    """
    with open(filename, 'r') as f:
        return f.read().split()   # split at '\n' or ' '

def getdata():
    """Load and return the pickled inverted index from data.pkl."""
    # 'with' closes the file on every path; the original relied on a
    # manual close that a pickle error would have skipped.
    with open('data.pkl', 'rb') as pkl_file:
        return pickle.load(pkl_file)

def output(result):
    """Pretty-print a query result (list of file names), ten at a time.

    None (single word missing from the index) or an empty list (no
    overlap between words) prints None.  Fewer than ten hits are printed
    whole; longer results are paged interactively, 10 per screen.
    """
    if not result:          # covers both None and []
        print(None)
        return

    total = len(result)
    if total < 10:
        print(result)
        return

    print('get ' + str(total) + ' records')     # the record count
    # ceil(total / 10) pages; the original's len/10 + 1 emitted an
    # empty extra page when total was a multiple of 10, and '/' is
    # float division on Python 3 — '//' fixes both.
    for page in range((total + 9) // 10):
        start = page * 10
        print('10 records start from ' + str(start + 1))
        print(result[start:start + 10])
        if start + 10 >= total:                 # last page — stop paging
            break
        answer = input("Enter 'N' for next ten records & other input to quit : ")
        if answer != 'N':
            break

#main function
def _parse_query(line, stopwords, stemmer):
    """Lowercase *line*, drop stop words (as whole tokens, not substrings),
    and return the Porter-stemmed query terms."""
    tokens = [tok for tok in line.lower().split() if tok not in stopwords]
    return [stemmer.stem(tok) for tok in tokens]


def main():
    """Interactive query loop: read words, intersect their posting lists
    from the pickled index, and page the matching file names."""
    data_list = getdata()                       # the inverted index
    stopwords = set(readfile('stop_word.txt'))  # set: O(1) membership test
    stemmer = PorterStemmer()                   # prepare for stemming

    while True:
        line = input("Enter your query('\\'to quit): ")
        if line == '\\':                        # leave the loop
            break
        query_list = _parse_query(line, stopwords, stemmer)
        # keep asking until the query contains at least one usable word
        while not query_list:
            line = input("Please enter more information: ")
            query_list = _parse_query(line, stopwords, stemmer)

        # Intersect the posting lists of every query word.
        # BUG FIX: .get(word, []) — the original's bare .get(word)
        # returned None for an unindexed word and crashed the
        # set(...).intersection(None) call below.
        result = []
        for k, word in enumerate(query_list):
            postings = data_list.get(word, [])
            if k == 0:                          # first word seeds the result
                result = postings
            else:                               # later words narrow it
                result = list(set(result).intersection(postings))
        output(result)


# run as a script
if __name__ == '__main__':
    main()

總結(jié)

以上是生活随笔為你收集整理的python倒排索引搜索引擎_【Python】倒排索引的全部內容,希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網(wǎng)站內(nèi)容還不錯(cuò),歡迎將生活随笔推薦給好友。