Building an Image Data Cleaning & Image Quality Inspection Tool in Python
As the number of images in our collected datasets keeps growing, a range of problems has emerged: inconsistent data formats, uneven quality, and partially corrupted files.
This program provides image data cleaning and image quality inspection functions to prevent these problems from surfacing as exceptions when a model loads the data during training. It does the following:
1. Read each image with several different libraries to check whether it is corrupted
2. Read each image's EXIF data to guard against annotation anomalies caused by orientation metadata
3. Record image information:
① Encoding format, resolution, channel count, and file size, to help judge other image properties
② MD5, pHash-16, and similar fingerprints, to detect duplicates (see the near-duplicate sketch after this list)
③ Peak signal-to-noise ratio (PSNR), structural similarity (SSIM), and related metrics, to judge image quality
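Identical MD5 digests catch byte-for-byte duplicates, but re-encoded or resized copies usually only match on perceptual hash. Below is a minimal sketch of comparing two pHash-16 hex strings with the imagehash library, mirroring the compute_phash/diff_phash helpers in the full script further down; the file names a.jpg/b.jpg and the 0.1 threshold are illustrative assumptions, not tuned values.

import imagehash
from PIL import Image

def phash_distance(hex_a, hex_b, hash_size=16):
    """Normalized Hamming distance between two pHash hex strings (0 = identical)."""
    h_a = imagehash.hex_to_hash(hex_a)  # rebuild ImageHash objects from hex strings (e.g. from the CSV report)
    h_b = imagehash.hex_to_hash(hex_b)
    return (h_a - h_b) / hash_size ** 2

# hypothetical example paths; any pair of images works
a = str(imagehash.phash(Image.open("a.jpg"), hash_size=16))
b = str(imagehash.phash(Image.open("b.jpg"), hash_size=16))
if phash_distance(a, b) < 0.1:  # assumed threshold -- tune on your own data
    print("likely near-duplicates")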
Complete code
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Purpose: image data cleaning & image quality inspection
# Author: AYangSN
# Date: 2025-03-12
# Version: 1.0

import csv
import os
import shutil
import hashlib

import cv2
import imagehash
import numpy as np
from tqdm import tqdm
from PIL import Image, ImageOps, ExifTags
from concurrent.futures import ThreadPoolExecutor, as_completed
from skimage.metrics import structural_similarity as ssim
from scipy.stats import entropy


def check_image_with_pil(filepath):
    """Use PIL to check whether the image is corrupted."""
    try:
        img = Image.open(filepath)
        img.verify()  # verify image integrity
        img = Image.open(filepath)  # reopen to make sure the image actually loads
        return True, img
    except Exception as e:
        return False, str(e)


def check_image_with_opencv(filepath):
    """Use OpenCV to check whether the image is corrupted."""
    try:
        image = cv2.imread(filepath)
        if image is None or image.size == 0:
            return False, "OpenCV could not load the image"
        return True, image
    except Exception as e:
        return False, str(e)


def check_file_header(filepath):
    """Check that the file's magic bytes match a known image format."""
    valid_headers = {
        'JPEG': b'\xff\xd8\xff',
        'PNG': b'\x89\x50\x4e\x47\x0d\x0a\x1a\x0a',
        'GIF87a': b'GIF87a',
        'GIF89a': b'GIF89a',
        'BMP': b'BM'
    }
    with open(filepath, 'rb') as f:
        header = f.read(8)  # the first 8 bytes cover every format above
    for fmt, magic in valid_headers.items():
        if header.startswith(magic):
            return True, None
    return False, "unknown file header"


def get_exif_orientation(image):
    """Return the EXIF Orientation tag, or None if absent."""
    try:
        exif = image._getexif()
    except AttributeError:
        exif = None
    if exif is None:
        return None
    exif = {
        ExifTags.TAGS[k]: v
        for k, v in exif.items()
        if k in ExifTags.TAGS
    }
    return exif.get('Orientation', None)


def exif_update_image_files(image, orientation, image_file, output_dir):
    """Rotate/mirror the image according to its EXIF orientation and save a clean copy."""
    # Pillow >= 9.1 uses the Image.Transpose enum; the bare Image.ROTATE_* constants were removed in Pillow 10.
    if orientation == 2:    # left-to-right mirror
        image = ImageOps.mirror(image)
    elif orientation == 3:  # rotate 180
        image = image.transpose(Image.Transpose.ROTATE_180)
    elif orientation == 4:  # top-to-bottom mirror
        image = ImageOps.flip(image)
    elif orientation == 5:  # top-to-left mirror
        image = ImageOps.mirror(image.transpose(Image.Transpose.ROTATE_270))
    elif orientation == 6:  # rotate 270
        image = image.transpose(Image.Transpose.ROTATE_270)
    elif orientation == 7:  # top-to-right mirror
        image = ImageOps.mirror(image.transpose(Image.Transpose.ROTATE_90))
    elif orientation == 8:  # rotate 90
        image = image.transpose(Image.Transpose.ROTATE_90)

    # build the output path, one sub-directory per orientation value
    outpath = os.path.join(output_dir, str(orientation))
    os.makedirs(outpath, exist_ok=True)

    # round-trip through OpenCV to strip the EXIF block before saving
    img = cv2.cvtColor(np.asarray(image), cv2.COLOR_RGB2BGR)
    _, imgname = os.path.split(image_file)
    cv2.imwrite(os.path.join(outpath, imgname), img)


def compute_md5(filepath):
    """Compute the file's MD5 digest."""
    hash_md5 = hashlib.md5()
    with open(filepath, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()


def compute_phash(imgpath, hash_size=16):
    """Compute the image's perceptual hash (pHash) as a hex string."""
    img = Image.open(imgpath)
    phash = imagehash.phash(img, hash_size=hash_size, highfreq_factor=4)
    return str(phash)


def diff_phash(p1, p2, hash_size=8):
    """Normalized Hamming distance between two pHash values (0 = identical)."""
    return (p1 - p2) / hash_size ** 2


def check_blur(image, ref_image=None):
    """Assess image blur/quality with several complementary metrics."""
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # variance of the Laplacian: low values suggest blur
    laplacian_var = cv2.Laplacian(gray, cv2.CV_64F).var()

    # Fourier spectrum energy (the +1 avoids log(0) on zero-magnitude bins)
    f = np.fft.fft2(gray)
    fshift = np.fft.fftshift(f)
    magnitude_spectrum = 20 * np.log(np.abs(fshift) + 1)
    fourier_energy = np.sum(magnitude_spectrum) / magnitude_spectrum.size

    # Tenengrad: mean Sobel gradient magnitude
    gradient_x = cv2.Sobel(gray, cv2.CV_64F, 1, 0, ksize=3)
    gradient_y = cv2.Sobel(gray, cv2.CV_64F, 0, 1, ksize=3)
    gradient_magnitude = np.sqrt(gradient_x**2 + gradient_y**2)
    tenengrad_value = np.mean(gradient_magnitude)

    # grayscale histogram entropy
    hist = cv2.calcHist([gray], [0], None, [256], [0, 256])
    hist_norm = hist.ravel() / hist.max()
    entropy_value = entropy(hist_norm, base=2)

    # SSIM, only when a reference image is supplied
    ssim_score = None
    if ref_image is not None:
        gray_ref = cv2.cvtColor(ref_image, cv2.COLOR_BGR2GRAY)
        ssim_score, _ = ssim(gray, gray_ref, full=True)

    return laplacian_var, fourier_energy, tenengrad_value, entropy_value, ssim_score


def process_images(filepath, output_dir):
    file_extension = os.path.splitext(filepath)[1].lower()

    # corruption checks
    pil_result, img_pil = check_image_with_pil(filepath)
    opencv_result, img_opencv = check_image_with_opencv(filepath)
    header_result, header_error = check_file_header(filepath)

    # only analyze images that passed every check
    if pil_result and opencv_result and header_result:
        # basic file and image properties
        file_size = os.path.getsize(filepath)  # bytes
        width, height = img_pil.size
        color_mode = img_pil.mode
        bit_depth = img_pil.bits if hasattr(img_pil, 'bits') else None
        channels = len(color_mode) if isinstance(color_mode, str) else None
        compression = img_pil.info.get('compression', 'Unknown')

        # EXIF orientation: re-save a corrected, EXIF-free copy when needed
        orientation = get_exif_orientation(img_pil)
        if not (orientation is None or orientation == 1):
            exif_update_image_files(img_pil, orientation, filepath, os.path.join(output_dir, 'exif'))

        # fingerprints for duplicate detection
        md5_checksum = compute_md5(filepath)
        hex_string = compute_phash(filepath, hash_size=16)

        laplacian_var, fourier_energy, tenengrad_value, entropy_value, ssim_score = check_blur(img_opencv)

        log_entry = {
            'filename': filepath,
            'file_extension': file_extension,
            'pil_check': pil_result,
            'opencv_check': opencv_result,
            'header_check': header_result,
            'header_error': header_error,
            'file_size': file_size,
            'resolution': (width, height),
            'color_mode': color_mode,
            'bit_depth': bit_depth,
            'channels': channels,
            'compression': compression,
            'exif_data': orientation,
            'md5_checksum': md5_checksum,
            'phash16_checksum': hex_string,
            'laplacian_var': laplacian_var,
            'fourier_energy': fourier_energy,
            'tenengrad_value': tenengrad_value,
            'entropy_value': entropy_value,
            'ssim_score': ssim_score
        }
    else:
        log_entry = {
            'filename': filepath,
            'file_extension': file_extension,
            'pil_check': pil_result,
            'opencv_check': opencv_result,
            'header_check': header_result,
            'header_error': header_error,
        }
        # copy the broken file into the output directory for later inspection
        broken_dir = os.path.join(output_dir, 'broken')
        os.makedirs(broken_dir, exist_ok=True)
        shutil.copy(filepath, broken_dir)

    print(f"File: {filepath}")
    print(f"PIL check: {'OK' if pil_result else 'failed'}")
    print(f"OpenCV check: {'OK' if opencv_result else 'failed'}")
    print(f"Header check: {'OK' if header_result else 'failed'} - {header_error}")
    print("-" * 40)

    return log_entry


def write_to_csv(log_entries, output_path):
    fieldnames = [
        'filename', 'file_extension', 'pil_check', 'opencv_check', 'header_check', 'header_error',
        'file_size', 'resolution', 'color_mode', 'bit_depth', 'channels', 'compression', 'exif_data',
        'md5_checksum', 'phash16_checksum',
        'laplacian_var', 'fourier_energy', 'tenengrad_value', 'entropy_value', 'ssim_score'
    ]
    mode = 'a' if os.path.exists(output_path) else 'w'
    with open(output_path, mode, newline='', encoding='utf-8-sig') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if mode == 'w':
            writer.writeheader()
        for entry in log_entries:
            writer.writerow(entry)


def main(input_dir, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    output_csv_path = os.path.join(output_dir, 'image_integrity_report.csv')

    # walk the input directory recursively and collect image paths
    filepaths = []
    for root, dirs, fs in tqdm(os.walk(input_dir), desc='Scanning directories...'):
        filepaths.extend([
            os.path.join(root, f) for f in fs
            if f.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tiff'))
        ])
    print(f"Found {len(filepaths)} images to process.")

    # process images in a thread pool and write results to CSV in batches
    batch_size = 100  # rows buffered per CSV write
    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = {executor.submit(process_images, fp, output_dir): fp for fp in filepaths}
        processed_entries = []
        for future in tqdm(as_completed(futures), total=len(futures), desc='Writing CSV...'):
            try:
                log_entry = future.result()
                processed_entries.append(log_entry)
                if len(processed_entries) >= batch_size:
                    write_to_csv(processed_entries, output_csv_path)
                    processed_entries.clear()
            except Exception as exc:
                print(f'{futures[future]} generated an exception: {exc}')

    # flush the remaining rows
    if processed_entries:
        write_to_csv(processed_entries, output_csv_path)
    print("Report generated.")


if __name__ == "__main__":
    # example usage
    input_directory = "your_inputpath"
    output_directory = "your_outputpath"
    main(input_directory, output_directory)
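The feature list above mentions PSNR, but the script only computes SSIM (and only when a reference image is passed to check_blur). If you do have a pristine reference, say the original before re-encoding, here is a minimal sketch using OpenCV's built-in cv2.PSNR; the file names are placeholders and the dB rules of thumb are rough guides only.

import cv2

# hypothetical paths: a reference image and a re-encoded copy of it
ref = cv2.imread("reference.png")
test = cv2.imread("compressed.jpg")
assert ref is not None and test is not None and ref.shape == test.shape

# cv2.PSNR expects two images of identical shape; higher dB = closer to the reference
psnr_db = cv2.PSNR(ref, test)
print(f"PSNR: {psnr_db:.2f} dB")  # roughly, >40 dB is near-lossless, <30 dB is visibly degraded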
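Once image_integrity_report.csv exists, the actual cleaning decisions (dropping duplicates, flagging blurry files) can be made from the report rather than by re-reading every image. A minimal pandas sketch, assuming the column names produced by write_to_csv above; the laplacian_var threshold of 100 is a placeholder to tune per dataset.

import pandas as pd

df = pd.read_csv("your_outputpath/image_integrity_report.csv")

# exact duplicates: identical MD5 digests
dupes = df[df.duplicated(subset="md5_checksum", keep="first")]
print(f"{len(dupes)} exact duplicates")

# likely-blurry images: low variance of the Laplacian
# (100 is an assumed starting threshold -- inspect samples before deleting anything)
blurry = df[df["laplacian_var"] < 100]
print(f"{len(blurry)} images flagged as blurry")

dupes.to_csv("duplicates.csv", index=False)
blurry.to_csv("blurry.csv", index=False)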
That concludes this walkthrough of building an image data cleaning & image quality inspection tool in Python.