當前位置：首頁 > 编程资源 > 编程问答 >内容正文

编程问答

yolo标注的数据清洗

發布時間：2025/3/20 编程问答 35 豆豆

生活随笔收集整理的這篇文章主要介紹了 yolo标注的数据清洗，小編覺得挺不錯的，現在分享給大家，幫大家做個參考。

文章目錄

原
- 代碼
20200708 無需讀取圖片分辨率直接指定

原

在做yolo標注時，有時我們需要剔除標注中的一些錯誤的標注或者超過一定范圍的標注，

比如我們希望將中心點距離圖像邊緣不足 Δ 的標注框剔除，可以使用以下方法：

代碼

# -*- coding: utf-8 -*-
"""
@File    : yolo_annotation_clean.py
@Time    : 2020/5/13 15:29
@Author  : Dontla
@Email   : sxana@qq.com
@Software: PyCharm

Clean YOLO annotation files.

Every ``*.txt`` label file in ``./source_txt/`` is read, each box is checked
against the resolution of the matching ``.jpg`` in ``./source_img/``, rejected
boxes are reported on stdout, and the surviving boxes are written to a file of
the same name in ``./target_txt/``.
"""
import os
import re
import shutil
import random

# Class ids that are considered correct; any other id is reported and dropped.
VALID_CLASS_IDS = (0,)
# Largest allowed ratio between the two sides of a box.
MAX_ASPECT_RATIO = 2
# Allowed side length of a box, in pixels.
MIN_BOX_SIDE = 80
MAX_BOX_SIDE = 300

# A plain decimal number: optional sign, digits with optional fraction,
# optional exponent.  Deliberately rejects "nan", "inf", hex literals, etc.
_NUMBER_RE = re.compile(r'[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?')


def _to_number(text):
    """Convert *text* to ``int`` when possible, otherwise to ``float``.

    Raises:
        ValueError: if *text* is not a plain decimal number.
    """
    if _NUMBER_RE.fullmatch(text) is None:
        raise ValueError('not a plain decimal number: {!r}'.format(text))
    try:
        return int(text)
    except ValueError:
        return float(text)


def sort_filenames(filenames, pattern):
    """Sort *filenames* in place by the numbers captured by *pattern*.

    A plain string sort orders names as 1, 10, 100, 2, ...; converting the
    captured text to numbers gives 1, 2, 3, ..., 10 instead.

    Args:
        filenames: list of file-name strings; it is reordered in place.
        pattern: regular expression (a string) whose capture group(s) select
            the part of each name to sort by.

    Names whose captured text is not a number are placed after all numeric
    names and ordered alphabetically among themselves.  The captured text is
    parsed, never evaluated, so an unusual file name cannot crash the sort or
    execute code.
    """

    def sort_key(name):
        key = []
        for captured in re.findall(pattern, name):
            # re.findall yields tuples when the pattern has several groups.
            parts = captured if isinstance(captured, tuple) else (captured,)
            for part in parts:
                try:
                    key.append((0, _to_number(part), ''))
                except ValueError:
                    key.append((1, 0, part))
        return key

    filenames.sort(key=sort_key)


def extract_content(content_):
    """Extract the boxes found in the text of a YOLO label file.

    Args:
        content_: full text of a label file, one box per line in the form
            ``class_id x_center y_center width height``.

    Returns:
        A list of 5-tuples of strings, one per box, e.g.
        ``[('0', '0.631250', '0.270833', '0.156250', '0.277778')]``.

    Only the first five whitespace-separated fields of a line are used.  Lines
    with fewer than five fields, or whose first five fields are not plain
    decimal numbers, are skipped.  Signed and scientific-notation values are
    accepted so that the range checks can report them instead of the line
    silently disappearing.
    """
    boxes = []
    for line in content_.splitlines():
        fields = line.split()
        if len(fields) < 5:
            continue
        candidate = tuple(fields[:5])
        if all(_NUMBER_RE.fullmatch(field) for field in candidate):
            boxes.append(candidate)
    return boxes


def find_box_problem(class_id, x, y, w, h, img_width, img_height):
    """Return the report header of the first check a box fails, else ``None``.

    Args:
        class_id: class id of the box.
        x, y: box centre in pixels.
        w, h: box width and height in pixels.
        img_width, img_height: image resolution in pixels.

    The checks run in a fixed order: class id, centre inside the image,
    corners inside the image, aspect ratio, side length.
    """
    if class_id not in VALID_CLASS_IDS:
        return '【類標注錯誤】：'
    if x < 0 or x >= img_width or y < 0 or y >= img_height:
        return '【標注框中心點超出圖片范圍】：'

    left, top = round(x - w / 2), round(y - h / 2)
    right, bottom = round(x + w / 2), round(y + h / 2)
    if (left < 0 or right > img_width or left >= right
            or top < 0 or bottom > img_height or top >= bottom):
        return '【標注框頂點超出范圍】：'

    # left < right and top < bottom imply w > 0 and h > 0, so the divisions
    # below cannot divide by zero.
    if w / h > MAX_ASPECT_RATIO or h / w > MAX_ASPECT_RATIO:
        return '【框的長寬比不合適】：'
    if w < MIN_BOX_SIDE or w > MAX_BOX_SIDE or h < MIN_BOX_SIDE or h > MAX_BOX_SIDE:
        return '【標注框大小有問題】：'
    return None


def clean_boxes(content, img_width, img_height, filename=''):
    """Filter the boxes of one label file and return the cleaned text.

    Args:
        content: full text of the label file.
        img_width, img_height: resolution of the matching image in pixels.
        filename: name printed in the report of each rejected box.

    Returns:
        The surviving boxes, one per line, without a trailing newline (an
        empty string when no box survives).

    Each rejected box is reported on stdout: the reason, the file name, the
    parsed values, and the pixel corners followed by the class id.
    """
    kept = []
    for box_tuple in extract_content(content):
        box = [_to_number(value) for value in box_tuple]
        # e.g. [0, 0.63125, 0.270833, 0.15625, 0.277778]
        class_id = box[0]
        x, y = box[1] * img_width, box[2] * img_height
        w, h = box[3] * img_width, box[4] * img_height

        problem = find_box_problem(class_id, x, y, w, h, img_width, img_height)
        if problem is not None:
            print(problem)
            print(filename)
            print(box)
            print('[{}, {}, {}, {}, {}]'.format(round(x - w / 2), round(y - h / 2), round(x + w / 2),
                                                round(y + h / 2), class_id))
            print('\n')
            continue

        kept.append('{} {} {} {} {}'.format(box[0], box[1], box[2], box[3], box[4]))
    return '\n'.join(kept)


def main():
    """Clean every label file of ``./source_txt/`` into ``./target_txt/``."""
    # Imported here so that the parsing helpers above stay importable on
    # machines without OpenCV; a missing OpenCV still fails before any file
    # is touched.
    import cv2

    # The three paths are relative to the current working directory.
    source_img_path = './source_img/'
    source_txt_path = './source_txt/'
    target_txt_path = './target_txt/'

    os.makedirs(target_txt_path, exist_ok=True)

    # os.listdir is not recursive; only label files directly inside the
    # folder are processed.
    filenames = [name for name in os.listdir(source_txt_path) if name.endswith('.txt')]

    # File names are plain numbers ("1.txt", "2.txt", ...); sort them
    # numerically rather than as strings.
    pattern = r'^(.*)\.txt$'
    sort_filenames(filenames, pattern)

    for filename in filenames:
        with open(os.path.join(source_txt_path, filename), 'r', encoding='utf-8') as f:
            content = f.read()

        # Read the matching image to get its resolution, so that images of
        # any size are handled.
        stem = os.path.splitext(filename)[0]
        img_file = os.path.join(source_img_path, stem + '.jpg')
        img = cv2.imread(img_file)
        if img is None:
            # cv2.imread signals a missing/unreadable file by returning None.
            raise FileNotFoundError(
                'cannot read image {!r} for label file {!r}'.format(img_file, filename))
        img_height, img_width = img.shape[:2]

        write_content = clean_boxes(content, img_width, img_height, filename)

        # Label files that end up empty are still written.  To drop them
        # instead, skip the write when write_content == ''.
        with open(os.path.join(target_txt_path, filename), 'w', encoding='utf-8') as f2:
            f2.write(write_content)


if __name__ == '__main__':
    main()

20200708 無需讀取圖片分辨率直接指定

# -*- coding: utf-8 -*-
"""
@File    : yolo_annotation_clean.py
@Time    : 2020/5/13 15:29
@Author  : Dontla
@Email   : sxana@qq.com
@Software: PyCharm

Clean YOLO annotation files using a fixed image resolution.

Every ``*.txt`` label file in ``./source_txt/`` is read, each box is checked
against a fixed resolution (no image is opened), rejected boxes are reported
on stdout and counted, and the surviving boxes are written to a file of the
same name in ``./target_txt/``.
"""
import os
import re
import shutil
import random

try:
    # Not used by this fixed-resolution variant; kept optional so the script
    # also runs on machines without OpenCV installed.
    import cv2  # noqa: F401
except ImportError:
    cv2 = None

# Class ids that are considered correct; any other id is reported and dropped.
VALID_CLASS_IDS = (0,)
# Largest allowed ratio between the two sides of a box.
MAX_ASPECT_RATIO = 2
# Allowed side length of a box, in pixels.
MIN_BOX_SIDE = 100
MAX_BOX_SIDE = 300

# A plain decimal number: optional sign, digits with optional fraction,
# optional exponent.  Deliberately rejects "nan", "inf", hex literals, etc.
_NUMBER_RE = re.compile(r'[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?')


def _to_number(text):
    """Convert *text* to ``int`` when possible, otherwise to ``float``.

    Raises:
        ValueError: if *text* is not a plain decimal number.
    """
    if _NUMBER_RE.fullmatch(text) is None:
        raise ValueError('not a plain decimal number: {!r}'.format(text))
    try:
        return int(text)
    except ValueError:
        return float(text)


def sort_filenames(filenames, pattern):
    """Sort *filenames* in place by the numbers captured by *pattern*.

    A plain string sort orders names as 1, 10, 100, 2, ...; converting the
    captured text to numbers gives 1, 2, 3, ..., 10 instead.

    Args:
        filenames: list of file-name strings; it is reordered in place.
        pattern: regular expression (a string) whose capture group(s) select
            the part of each name to sort by.

    Names whose captured text is not a number are placed after all numeric
    names and ordered alphabetically among themselves.  The captured text is
    parsed, never evaluated, so an unusual file name cannot crash the sort or
    execute code.
    """

    def sort_key(name):
        key = []
        for captured in re.findall(pattern, name):
            # re.findall yields tuples when the pattern has several groups.
            parts = captured if isinstance(captured, tuple) else (captured,)
            for part in parts:
                try:
                    key.append((0, _to_number(part), ''))
                except ValueError:
                    key.append((1, 0, part))
        return key

    filenames.sort(key=sort_key)


def extract_content(content_):
    """Extract the boxes found in the text of a YOLO label file.

    Args:
        content_: full text of a label file, one box per line in the form
            ``class_id x_center y_center width height``.

    Returns:
        A list of 5-tuples of strings, one per box, e.g.
        ``[('0', '0.631250', '0.270833', '0.156250', '0.277778')]``.

    Only the first five whitespace-separated fields of a line are used.  Lines
    with fewer than five fields, or whose first five fields are not plain
    decimal numbers, are skipped.  Signed and scientific-notation values are
    accepted so that the range checks can report and count them instead of
    the line silently disappearing.
    """
    boxes = []
    for line in content_.splitlines():
        fields = line.split()
        if len(fields) < 5:
            continue
        candidate = tuple(fields[:5])
        if all(_NUMBER_RE.fullmatch(field) for field in candidate):
            boxes.append(candidate)
    return boxes


def find_box_problem(class_id, x, y, w, h, img_width, img_height):
    """Return the report header of the first check a box fails, else ``None``.

    Args:
        class_id: class id of the box.
        x, y: box centre in pixels.
        w, h: box width and height in pixels.
        img_width, img_height: image resolution in pixels.

    The checks run in a fixed order: class id, centre inside the image,
    corners inside the image, aspect ratio, side length.
    """
    if class_id not in VALID_CLASS_IDS:
        return '【類標注錯誤】：'
    if x < 0 or x >= img_width or y < 0 or y >= img_height:
        return '【標注框中心點超出圖片范圍】：'

    left, top = round(x - w / 2), round(y - h / 2)
    right, bottom = round(x + w / 2), round(y + h / 2)
    if (left < 0 or right > img_width or left >= right
            or top < 0 or bottom > img_height or top >= bottom):
        return '【標注框頂點超出范圍】：'

    # left < right and top < bottom imply w > 0 and h > 0, so the divisions
    # below cannot divide by zero.
    if w / h > MAX_ASPECT_RATIO or h / w > MAX_ASPECT_RATIO:
        return '【框的長寬比不合適】：'
    # Side length must lie within [MIN_BOX_SIDE, MAX_BOX_SIDE] pixels.
    if w < MIN_BOX_SIDE or w > MAX_BOX_SIDE or h < MIN_BOX_SIDE or h > MAX_BOX_SIDE:
        return '【標注框大小有問題】：'
    return None


def clean_boxes(content, img_width, img_height, filename=''):
    """Filter the boxes of one label file.

    Args:
        content: full text of the label file.
        img_width, img_height: image resolution in pixels.
        filename: name printed in the report of each rejected box.

    Returns:
        A ``(text, rejected)`` pair: the surviving boxes, one per line and
        without a trailing newline (empty when no box survives), and the
        number of boxes that were rejected.

    Each rejected box is reported on stdout: the reason, the file name, the
    parsed values, and the pixel corners followed by the class id.
    """
    kept = []
    rejected = 0
    for box_tuple in extract_content(content):
        box = [_to_number(value) for value in box_tuple]
        # e.g. [0, 0.63125, 0.270833, 0.15625, 0.277778]
        class_id = box[0]
        x, y = box[1] * img_width, box[2] * img_height
        w, h = box[3] * img_width, box[4] * img_height

        problem = find_box_problem(class_id, x, y, w, h, img_width, img_height)
        if problem is not None:
            rejected += 1
            print(problem)
            print(filename)
            print(box)
            print('[{}, {}, {}, {}, {}]'.format(round(x - w / 2), round(y - h / 2), round(x + w / 2),
                                                round(y + h / 2), class_id))
            print('\n')
            continue

        kept.append('{} {} {} {} {}'.format(box[0], box[1], box[2], box[3], box[4]))
    return '\n'.join(kept), rejected


def main():
    """Clean every label file of ``./source_txt/`` into ``./target_txt/``."""
    # Both paths are relative to the current working directory.
    source_txt_path = './source_txt/'
    target_txt_path = './target_txt/'

    # Image resolution, specified directly instead of being read from images.
    img_width = 1280
    img_height = 720

    # Number of rejected boxes over all files.
    error_boxs_num = 0

    os.makedirs(target_txt_path, exist_ok=True)

    # os.listdir is not recursive; only label files directly inside the
    # folder are processed.
    filenames = [name for name in os.listdir(source_txt_path) if name.endswith('.txt')]

    # File names are plain numbers ("1.txt", "2.txt", ...); sort them
    # numerically rather than as strings.
    pattern = r'^(.*)\.txt$'
    sort_filenames(filenames, pattern)

    for filename in filenames:
        with open(os.path.join(source_txt_path, filename), 'r', encoding='utf-8') as f:
            content = f.read()

        write_content, rejected = clean_boxes(content, img_width, img_height, filename)
        error_boxs_num += rejected

        # Label files that end up empty are still written.
        with open(os.path.join(target_txt_path, filename), 'w', encoding='utf-8') as f2:
            f2.write(write_content)

    print('錯誤標注框數量：{}'.format(error_boxs_num))


if __name__ == '__main__':
    main()

結果：

總結

以上是生活随笔為你收集整理的yolo标注的数据清洗的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯，歡迎將生活随笔推薦給好友。

数据
Yolo

编程问答

yolo标注的数据清洗

文章目錄

原

代碼

20200708 無需讀取圖片分辨率，直接指定

總結

20200708 無需讀取圖片分辨率直接指定