被忽视的fuzzywuzzy库
生活随笔
收集整理的這篇文章主要介紹了
被忽视的fuzzywuzzy库
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
fuzzywuzzy包是一個可以對字符串進行模糊匹配的包
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
字符串的對比
fuzz.ratio()對位置敏感,全匹配
fuzz.partial_ratio()非完全匹配
str1 = '毛利是個小菜比'
str2 = '毛利是個小菜比,毛利是個小菜比'
print("fuzz.ratio相似度:",fuzz.ratio(str1,str2))
print("fuzz.partial_ratio相似度:",fuzz.partial_ratio(str1,str2))
fuzz.ratio相似度: 64
fuzz.partial_ratio相似度: 100
str1 = '毛利說:是個小菜比'
str2 = '毛利說是個小菜比'
print("fuzz.ratio相似度:",fuzz.ratio(str1,str2))
print("fuzz.partial_ratio相似度:",fuzz.partial_ratio(str1,str2))
fuzz.ratio相似度: 94
fuzz.partial_ratio相似度: 88
忽略順序匹配(token_sort_ratio)
str1 = '毛利說:是個小菜比'
str2 = '是個小菜比:毛利說'
print("fuzz.ratio相似度:",fuzz.ratio(str1,str2))
print("fuzz.partial_ratio相似度:",fuzz.partial_ratio(str1,str2))
print("token_sort_ratio相似度:",fuzz.token_sort_ratio(str1,str2))
fuzz.ratio相似度: 56
fuzz.partial_ratio相似度: 56
token_sort_ratio相似度: 100
去重子集匹配(token_set_ratio)
str1 = '毛利說:是個小菜比'
str2 = '毛利說:是個小小菜比'
print("fuzz.ratio相似度:",fuzz.ratio(str1,str2))
print("fuzz.partial_ratio相似度:",fuzz.partial_ratio(str1,str2))
print("token_sort_ratio相似度:",fuzz.token_sort_ratio(str1,str2))
print("token_set_ratio相似度:",fuzz.token_set_ratio(str1,str2))
fuzz.ratio相似度: 95
fuzz.partial_ratio相似度: 89
token_sort_ratio相似度: 95
token_set_ratio相似度: 95
print(fuzz.token_sort_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear"))
print(fuzz.token_set_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear"))
84
100
process
用來返回模糊匹配的字符串和相似度
choices = ["python爬蟲教程", "python機器學習教程", "Python數據分析教程", "pythonweb開發教程"]
print(process.extract("數據分析", choices, limit=3))
print(process.extractOne("分析", choices))
[('Python數據分析教程', 90), ('python爬蟲教程', 0), ('python機器學習教程', 0)]
('Python數據分析教程', 90)
案例
求和
import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# Canonical state / territory names mapped to their postal codes.
# Defined at module level so the row helpers below also work when this
# module is imported, not only when it is run as a script.
state_to_code = {
    "VERMONT": "VT", "GEORGIA": "GA", "IOWA": "IA", "Armed Forces Pacific": "AP", "GUAM": "GU",
    "KANSAS": "KS", "FLORIDA": "FL", "AMERICAN SAMOA": "AS", "NORTH CAROLINA": "NC", "HAWAII": "HI",
    "NEW YORK": "NY", "CALIFORNIA": "CA", "ALABAMA": "AL", "IDAHO": "ID",
    "FEDERATED STATES OF MICRONESIA": "FM",
    "Armed Forces Americas": "AA",
    "DELAWARE": "DE", "ALASKA": "AK", "ILLINOIS": "IL",
    "Armed Forces Africa": "AE", "SOUTH DAKOTA": "SD", "CONNECTICUT": "CT", "MONTANA": "MT",
    "MASSACHUSETTS": "MA",
    "PUERTO RICO": "PR", "Armed Forces Canada": "AE", "NEW HAMPSHIRE": "NH", "MARYLAND": "MD",
    "NEW MEXICO": "NM",
    "MISSISSIPPI": "MS", "TENNESSEE": "TN", "PALAU": "PW", "COLORADO": "CO",
    "Armed Forces Middle East": "AE",
    "NEW JERSEY": "NJ", "UTAH": "UT", "MICHIGAN": "MI", "WEST VIRGINIA": "WV", "WASHINGTON": "WA",
    "MINNESOTA": "MN", "OREGON": "OR", "VIRGINIA": "VA", "VIRGIN ISLANDS": "VI",
    "MARSHALL ISLANDS": "MH",
    "WYOMING": "WY", "OHIO": "OH", "SOUTH CAROLINA": "SC", "INDIANA": "IN", "NEVADA": "NV",
    "LOUISIANA": "LA",
    "NORTHERN MARIANA ISLANDS": "MP", "NEBRASKA": "NE", "ARIZONA": "AZ", "WISCONSIN": "WI",
    "NORTH DAKOTA": "ND",
    "Armed Forces Europe": "AE", "PENNSYLVANIA": "PA", "OKLAHOMA": "OK", "KENTUCKY": "KY",
    "RHODE ISLAND": "RI",
    "DISTRICT OF COLUMBIA": "DC", "ARKANSAS": "AR", "MISSOURI": "MO", "TEXAS": "TX", "MAINE": "ME",
}
# Candidate list handed to fuzzywuzzy; keeps the dict's insertion order.
states = list(state_to_code.keys())


def _match_state(row):
    """Return the best fuzzy match for ``row['state']`` or ``None``.

    The result is whatever ``process.extractOne`` returns with
    ``score_cutoff=80``: a ``(name, score)`` tuple, or ``None`` when no
    candidate reaches the cutoff.  Rows whose ``state`` equals 0 are
    skipped; the totals row built in ``main`` is filled with 0.
    """
    if row['state'] == 0:
        return None
    return process.extractOne(row['state'], states, score_cutoff=80)


def enum_row(row):
    """Print the ``state`` field of *row* (for ``DataFrame.apply(axis=1)``)."""
    print(row['state'])


def find_state_code(row):
    """Print the fuzzy-match result for the row's state, skipping 0 rows."""
    if row['state'] != 0:
        print(_match_state(row))


def capital(str):
    """Return *str* capitalized via ``str.capitalize()``.

    The parameter name shadows the builtin; it is kept unchanged so any
    existing keyword callers keep working.
    """
    return str.capitalize()


def correct_state(row):
    """Return the canonical state name for *row* with each word capitalized.

    Falls back to the original ``row['state']`` value when the row is
    skipped (state equals 0) or no candidate reaches the score cutoff.
    """
    state = _match_state(row)
    if state:
        state_name = state[0]
        return ' '.join(map(capital, state_name.split(' ')))
    return row['state']


def fill_state_code(row):
    """Return the code for the row's state, or ``''`` when there is no match."""
    state = _match_state(row)
    if state:
        state_name = state[0]
        return state_to_code[state_name]
    return ''


def main():
    """Load the sales workbook, add totals, normalize states, and export."""
    pd.set_option('display.width', 200)
    # A plain relative path resolves on every OS; the original '.\\sales.xlsx'
    # only worked on Windows.
    data = pd.read_excel('sales.xlsx', sheet_name='sheet1', header=0)
    print('data.head() = \n', data.head())
    print('data.tail() = \n', data.tail())
    print('data.dtypes = \n', data.dtypes)
    print('data.columns = \n', data.columns)
    for c in data.columns:
        print(c, end=' ')
    print()
    data['total'] = data['Jan'] + data['Feb'] + data['Mar']
    print(data.head())
    print(data['Jan'].sum())
    print(data['Jan'].min())
    print(data['Jan'].max())
    print(data['Jan'].mean())
    print('=============')

    # Add a totals row (shown step by step first)
    s1 = data[['Jan', 'Feb', 'Mar', 'total']].sum()
    print(s1)
    s2 = pd.DataFrame(data=s1)
    print(s2)
    print(s2.T)
    print(s2.T.reindex(columns=data.columns))
    # i.e. the same thing in one go:
    s = pd.DataFrame(data=data[['Jan', 'Feb', 'Mar', 'total']].sum()).T
    s = s.reindex(columns=data.columns, fill_value=0)
    print(s)
    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    data = pd.concat([data, s], ignore_index=True)
    # Label the last row instead of hard-coding index 15, so the script
    # works for any number of input rows.
    data = data.rename(index={data.index[-1]: 'Total'})
    print(data.tail())

    # Using apply
    print('==============apply的使用==========')
    data.apply(enum_row, axis=1)

    print(fuzz.ratio('Python Package', 'PythonPackage'))
    print(process.extract('Mississippi', states))
    print(process.extract('Mississipi', states, limit=1))
    print(process.extractOne('Mississipi', states))
    data.apply(find_state_code, axis=1)

    print('Before Correct State:\n', data['state'])
    data['state'] = data.apply(correct_state, axis=1)
    print('After Correct State:\n', data['state'])
    data.insert(5, 'State Code', np.nan)
    data['State Code'] = data.apply(fill_state_code, axis=1)
    print(data)

    # group by
    print('==============group by================')
    print(data.groupby('State Code'))
    print('All Columns:\n')
    # numeric_only=True keeps the sum to numeric columns; pandas >= 2.0
    # would otherwise also try to sum object columns.
    print(data.groupby('State Code').sum(numeric_only=True))
    print('Short Columns:\n')
    print(data[['State Code', 'Jan', 'Feb', 'Mar', 'total']].groupby('State Code').sum())

    # Write the result to a file
    data.to_excel('sales_result.xlsx', sheet_name='Sheet1', index=False)


if __name__ == "__main__":
    main()

# Author's note: this approach is quite complicated; it looks like I should
# write up some notes on Office automation later.
與50位技術專家面對面20年技術見證,附贈技術全景圖總結
以上是生活随笔為你收集整理的被忽视的fuzzywuzzy库的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 蚂蚁花呗怎么还信用卡
- 下一篇: NLP神器—Gensim