當前位置：首頁 > 编程语言 > python >内容正文

python

python实现简易搜索引擎（含代码）

發布時間：2024/1/8 python 23 豆豆

生活随笔收集整理的這篇文章主要介紹了《python实现简易搜索引擎（含代码）》，小編覺得挺不錯的，現在分享給大家，幫大家做個參考。

今天我們使用python來搭建簡易的搜索引擎。

搜索引擎的本質其實就是三件事：對數據的預處理、分詞構建索引，以及查詢。

（這邊我們默認所有的數據都是utf-8編碼）

我們在一個網站上去獲取所有的URL：

def crawl(pages, depth=2):
    """Follow links breadth-first from *pages* and collect the URLs found.

    Args:
        pages: Iterable of start URLs.
        depth: Number of link levels to follow. ``0`` fetches nothing.

    Returns:
        A set with every absolute ``http``/``https`` link discovered on the
        fetched pages (fragments removed). Empty when nothing could be
        fetched.
    """
    import http.client
    import urllib.request
    from urllib.parse import urljoin

    found = set()
    crawled = set()
    for _ in range(depth):
        newpages = set()
        for page in pages:
            # A page reachable at several levels is only downloaded once.
            if page in crawled:
                continue
            crawled.add(page)
            try:
                response = urllib.request.urlopen(page)
            except (ValueError, OSError, http.client.HTTPException):
                # ValueError: malformed URL; OSError covers URLError/HTTPError.
                print('Invaild page:', page)
                continue
            with response:
                soup = bs4.BeautifulSoup(response.read())

            for link in soup('a'):
                if 'href' not in dict(link.attrs):
                    continue
                # Resolve relative links against the page they appear on.
                url = urljoin(page, link['href'])
                if "'" in url:
                    continue
                url = url.split('#')[0]  # drop the fragment part
                if url.startswith('http'):
                    newpages.add(url)
        found |= newpages
        # The links found on this level are the pages of the next level.
        pages = newpages
    return found

通過一個循環抓取當前頁面上所有的鏈接，我們盡可能多的去抓取鏈接。之所以選擇set而不使用list，是為了防止重復的現象。我們可以將爬取的網站存放到文件、MySQL或者MongoDB里。

# Keep a reference to the current stdout, then point sys.stdout at lujing.txt
# so that everything printed after this line is written to that file.
# NOTE(review): nothing visible here restores sys.stdout from `output` or
# closes `outputfile` - confirm that happens later in the script.
output = sys.stdout
outputfile = open('lujing.txt', 'w')
sys.stdout = outputfile
# NOTE(review): GetFileList and lujing are not defined in this excerpt;
# presumably GetFileList prints the file paths that end up in lujing.txt.
# NOTE(review): the name `list` shadows the builtin for the rest of the module.
list = GetFileList(lujing, [])

將生成的路徑文件lujing.txt讀取，并按照路徑文件對文本處理

# Read the path list in lujing.txt and, for every listed file, strip the HTML
# tags and save the remaining text under TXT\<name>.txt.
dr = re.compile(r'<[^>]+>', re.S)  # matches any HTML tag; compiled once
with open("lujing.txt") as lujing_file:
    for line in lujing_file:
        print(line)
        # The slices assume a fixed path layout: a 12-character prefix, one
        # separator, a 3-character folder, one separator, then the file name
        # starting at index 17 - TODO confirm against the real lujing.txt.
        line1 = line[0:12]
        line2 = line[13:16]
        line3 = line[17:-1]  # file name without the last character (presumably the newline)
        line4 = line[17:-6]  # file name without its last 6 characters (presumably ".html" + newline)
        line = line1 + '\\' + line2 + '\\' + line3
        print(line4)
        path = line
        with open(path, "rb") as fb:
            data = fb.read()
        # Detect the encoding of the file and use it for reading and writing.
        bianma = chardet.detect(data)['encoding']
        with open(line, 'r', encoding=bianma, errors='ignore') as html_file:
            page = html_file.read()
        dd = dr.sub('', page)
        print(dd)
        fname = 'TXT' + "\\" + line4 + ".txt"
        # Store the tag-free text under the original name as a .txt file.
        with open(fname, "w+", encoding=bianma) as f:
            f.write(dd)

下面我們進行分詞索引：

因為大家都比較熟悉SQL語句，那我在這里就寫成MySQL的版本了，如果需要MongoDB的可以私信公眾號。

import jieba
import chardet
import pymysql
import importlib, sys
importlib.reload(sys)

# 如果使用MongoDB
# from pymongo import MongoClient
# #data processing
# client = MongoClient('localhost',27017)
# apiDB = client['urlDB'] ? ?#serverDB_name:test_nodedata
# questionnaires = apiDB['weburl']
# data = list(questionnaires.find())
# Connect to the index database and (re)create the two tables it uses:
#   doc  - document id -> link
#   word - term -> space-separated list of document ids
# NOTE(review): the credentials are hard-coded; move them to configuration
# before using this outside a local experiment.
conn = pymysql.connect(host="localhost", user="root",
                       password="123456", db="suoyin", port=3307)
# NOTE(review): text_factory is a sqlite3 connection attribute; it is
# presumably a leftover and ignored by pymysql - confirm before removing.
conn.text_factory = str
c = conn.cursor()
# "if exists" lets the script run against a fresh database where the tables
# have not been created yet.
c.execute('drop table if exists doc')
c.execute('create table doc (id int primary key,link text)')
c.execute('drop table if exists word')
c.execute('create table word (term varchar(25) primary key,list text)')
conn.commit()
# The connection is deliberately left open: Fenci() below works on this same
# module-level `conn` and closes it when indexing is finished.

def Fenci():
    """Tokenise every document listed in url.txt and fill the index tables.

    For each line of url.txt the matching text file under ``TXT\\`` is read,
    segmented with ``jieba.cut_for_search`` and recorded in MySQL: one row in
    ``doc`` (id, link) and, per term, the document id appended to the term's
    space-separated list in ``word`` once for every occurrence.

    Uses the module-level ``conn``; commits once at the end and always closes
    the connection.
    """
    from collections import Counter

    # The module-level connection may already have been closed by earlier
    # setup code; ping(reconnect=True) reopens it in that case.
    conn.ping(reconnect=True)
    c = conn.cursor()
    num = 0
    try:
        with open("url.txt") as url_file:
            for line in url_file:
                lujing = line
                print(lujing)
                num += 1
                print(line)
                line = line[17:-5]
                print(line)
                line = 'TXT' + '\\' + line + 'Txt'  # location of the text file
                print(line)
                path = line
                with open(path, "rb") as fb:
                    data = fb.read()
                bianma = chardet.detect(data)['encoding']  # detected file encoding
                if bianma == 'UTF-16':
                    data = data.decode('UTF-16')
                    data = data.encode('utf-8')
                seglist = list(jieba.cut_for_search(data))
                print(seglist)

                # pymysql uses %s placeholders (not the sqlite-style "?").
                c.execute('insert into doc values(%s,%s)', (num, lujing))
                # One round trip per distinct term: the document id is
                # repeated once per occurrence, which yields the same list as
                # appending it token by token.
                for word, count in Counter(seglist).items():
                    postings = ' '.join([str(num)] * count)
                    c.execute('select list from word where term=%s', (word,))
                    result = c.fetchall()
                    if not result:
                        # Term not seen before: start its list.
                        c.execute('insert into word values(%s,%s)',
                                  (word, postings))
                    else:
                        # Term already known: extend its list.
                        docliststr = result[0][0] + ' ' + postings
                        c.execute('update word set list=%s where term=%s',
                                  (docliststr, word))
        conn.commit()
    finally:
        conn.close()


Fenci()

最后一步，查詢：

import pymysql
import jieba
import math

# Query script: segment the search phrase, score every document that contains
# one of its terms with tf * idf, and print the best matches.
# NOTE(review): the credentials are hard-coded; move them to configuration
# before using this outside a local experiment.
conn = pymysql.connect(host="localhost", user="root",
                       password="123456", db="suoyin", port=3307)
c = conn.cursor()
c.execute('select count(*) from doc')
N = 1 + c.fetchall()[0][0]  # number of documents (+1)
target = input('請輸入搜索詞：')
seggen = jieba.cut_for_search(target)
score = {}  # document id -> accumulated match score
for word in seggen:
    print('得到查詢詞：', word)
    tf = {}  # document id -> occurrences of this term in the document
    # pymysql uses %s placeholders (not the sqlite-style "?").
    c.execute('select list from word where term=%s', (word,))
    result = c.fetchall()
    if len(result) > 0:
        # The stored value is a space-separated list of document ids, one
        # entry per occurrence of the term.
        doclist = [int(x) for x in result[0][0].split(' ')]
        # Number of distinct documents that contain the term.
        df = len(set(doclist))
        idf = math.log(N / df)
        print('idf：', idf)
        for num in doclist:
            tf[num] = tf.get(num, 0) + 1
        # Term frequencies are complete; add this term's share to each score.
        for num in tf:
            score[num] = score.get(num, 0) + tf[num] * idf
sortedlist = sorted(score.items(), key=lambda d: d[1], reverse=True)

cnt = 0
for num, docscore in sortedlist:
    cnt = cnt + 1
    c.execute('select link from doc where id=%s', (num,))
    url = c.fetchall()[0][0]
    print("Result Ranking：", cnt)
    print('url：', url, 'match degree：', docscore)

    # Stop after 20 results; the previous "> 20" test, evaluated after
    # printing, let a 21st result through.
    if cnt >= 20:
        break
if cnt == 0:
    print('No result')
conn.close()

搞定。

總結

以上是生活随笔為你收集整理的python实现简易搜索引擎（含代码）的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯，歡迎將生活随笔推薦給好友。

上一篇：读《470个建筑设计创意发想》
下一篇： python为啥爬取数据会有重复_使用p