Building an Image Data Cleaning & Image Quality Inspection Tool in Python
As the number of images in our collected datasets keeps growing, a range of problems has emerged: inconsistent data formats, uneven quality, and partially corrupted files.
This program provides image data cleaning and image quality inspection functions to prevent these problems from surfacing as exceptions when a model loads the data during training. It does the following:
1. Read each image with several different libraries to check whether it is corrupted
2. Read each image's EXIF data to guard against annotation anomalies caused by orientation metadata
3. Record image information:
① Encoding format, resolution, channel count, and file size, to help judge other image properties
② MD5, pHash-16, and similar fingerprints, to detect duplicates (see the near-duplicate sketch after this list)
③ Peak signal-to-noise ratio (PSNR), structural similarity (SSIM), and related metrics, to judge image quality
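Identical MD5 digests catch byte-for-byte duplicates, but re-encoded or resized copies usually only match on perceptual hash. Below is a minimal sketch of comparing two pHash-16 hex strings with the imagehash library, mirroring the compute_phash/diff_phash helpers in the full script further down; the file names a.jpg/b.jpg and the 0.1 threshold are illustrative assumptions, not tuned values.

import imagehash
from PIL import Image

def phash_distance(hex_a, hex_b, hash_size=16):
    """Normalized Hamming distance between two pHash hex strings (0 = identical)."""
    h_a = imagehash.hex_to_hash(hex_a)  # rebuild ImageHash objects from hex strings (e.g. from the CSV report)
    h_b = imagehash.hex_to_hash(hex_b)
    return (h_a - h_b) / hash_size ** 2

# hypothetical example paths; any pair of images works
a = str(imagehash.phash(Image.open("a.jpg"), hash_size=16))
b = str(imagehash.phash(Image.open("b.jpg"), hash_size=16))
if phash_distance(a, b) < 0.1:  # assumed threshold -- tune on your own data
    print("likely near-duplicates")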
Complete code
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Purpose: image data cleaning & image quality inspection
# Author: AYangSN
# Date: 2025-03-12
# Version: 1.0

import csv
import os
import shutil
import hashlib

import cv2
import imagehash
import numpy as np
from tqdm import tqdm
from PIL import Image, ImageOps, ExifTags
from concurrent.futures import ThreadPoolExecutor, as_completed
from skimage.metrics import structural_similarity as ssim
from scipy.stats import entropy


def check_image_with_pil(filepath):
    """Use PIL to check whether the image is corrupted."""
    try:
        img = Image.open(filepath)
        img.verify()  # verify image integrity
        img = Image.open(filepath)  # reopen to make sure the image actually loads
        return True, img
    except Exception as e:
        return False, str(e)


def check_image_with_opencv(filepath):
    """Use OpenCV to check whether the image is corrupted."""
    try:
        image = cv2.imread(filepath)
        if image is None or image.size == 0:
            return False, "OpenCV could not load the image"
        return True, image
    except Exception as e:
        return False, str(e)


def check_file_header(filepath):
    """Check that the file's magic bytes match a known image format."""
    valid_headers = {
        'JPEG': b'\xff\xd8\xff',
        'PNG': b'\x89\x50\x4e\x47\x0d\x0a\x1a\x0a',
        'GIF87a': b'GIF87a',
        'GIF89a': b'GIF89a',
        'BMP': b'BM'
    }
    with open(filepath, 'rb') as f:
        header = f.read(8)  # the first 8 bytes cover every format above
    for fmt, magic in valid_headers.items():
        if header.startswith(magic):
            return True, None
    return False, "unknown file header"


def get_exif_orientation(image):
    """Return the EXIF Orientation tag, or None if absent."""
    try:
        exif = image._getexif()
    except AttributeError:
        exif = None
    if exif is None:
        return None
    exif = {
        ExifTags.TAGS[k]: v
        for k, v in exif.items()
        if k in ExifTags.TAGS
    }
    return exif.get('Orientation', None)


def exif_update_image_files(image, orientation, image_file, output_dir):
    """Rotate/mirror the image according to its EXIF orientation and save a clean copy."""
    # Pillow >= 9.1 uses the Image.Transpose enum; the bare Image.ROTATE_* constants were removed in Pillow 10.
    if orientation == 2:    # left-to-right mirror
        image = ImageOps.mirror(image)
    elif orientation == 3:  # rotate 180
        image = image.transpose(Image.Transpose.ROTATE_180)
    elif orientation == 4:  # top-to-bottom mirror
        image = ImageOps.flip(image)
    elif orientation == 5:  # top-to-left mirror
        image = ImageOps.mirror(image.transpose(Image.Transpose.ROTATE_270))
    elif orientation == 6:  # rotate 270
        image = image.transpose(Image.Transpose.ROTATE_270)
    elif orientation == 7:  # top-to-right mirror
        image = ImageOps.mirror(image.transpose(Image.Transpose.ROTATE_90))
    elif orientation == 8:  # rotate 90
        image = image.transpose(Image.Transpose.ROTATE_90)

    # build the output path, one sub-directory per orientation value
    outpath = os.path.join(output_dir, str(orientation))
    os.makedirs(outpath, exist_ok=True)

    # round-trip through OpenCV to strip the EXIF block before saving
    img = cv2.cvtColor(np.asarray(image), cv2.COLOR_RGB2BGR)
    _, imgname = os.path.split(image_file)
    cv2.imwrite(os.path.join(outpath, imgname), img)


def compute_md5(filepath):
    """Compute the file's MD5 digest."""
    hash_md5 = hashlib.md5()
    with open(filepath, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()


def compute_phash(imgpath, hash_size=16):
    """Compute the image's perceptual hash (pHash) as a hex string."""
    img = Image.open(imgpath)
    phash = imagehash.phash(img, hash_size=hash_size, highfreq_factor=4)
    return str(phash)


def diff_phash(p1, p2, hash_size=8):
    """Normalized Hamming distance between two pHash values (0 = identical)."""
    return (p1 - p2) / hash_size ** 2


def check_blur(image, ref_image=None):
    """Assess image blur/quality with several complementary metrics."""
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # variance of the Laplacian: low values suggest blur
    laplacian_var = cv2.Laplacian(gray, cv2.CV_64F).var()

    # Fourier spectrum energy (the +1 avoids log(0) on zero-magnitude bins)
    f = np.fft.fft2(gray)
    fshift = np.fft.fftshift(f)
    magnitude_spectrum = 20 * np.log(np.abs(fshift) + 1)
    fourier_energy = np.sum(magnitude_spectrum) / magnitude_spectrum.size

    # Tenengrad: mean Sobel gradient magnitude
    gradient_x = cv2.Sobel(gray, cv2.CV_64F, 1, 0, ksize=3)
    gradient_y = cv2.Sobel(gray, cv2.CV_64F, 0, 1, ksize=3)
    gradient_magnitude = np.sqrt(gradient_x**2 + gradient_y**2)
    tenengrad_value = np.mean(gradient_magnitude)

    # grayscale histogram entropy
    hist = cv2.calcHist([gray], [0], None, [256], [0, 256])
    hist_norm = hist.ravel() / hist.max()
    entropy_value = entropy(hist_norm, base=2)

    # SSIM, only when a reference image is supplied
    ssim_score = None
    if ref_image is not None:
        gray_ref = cv2.cvtColor(ref_image, cv2.COLOR_BGR2GRAY)
        ssim_score, _ = ssim(gray, gray_ref, full=True)

    return laplacian_var, fourier_energy, tenengrad_value, entropy_value, ssim_score


def process_images(filepath, output_dir):
    file_extension = os.path.splitext(filepath)[1].lower()

    # corruption checks
    pil_result, img_pil = check_image_with_pil(filepath)
    opencv_result, img_opencv = check_image_with_opencv(filepath)
    header_result, header_error = check_file_header(filepath)

    # only analyze images that passed every check
    if pil_result and opencv_result and header_result:
        # basic file and image properties
        file_size = os.path.getsize(filepath)  # bytes
        width, height = img_pil.size
        color_mode = img_pil.mode
        bit_depth = img_pil.bits if hasattr(img_pil, 'bits') else None
        channels = len(color_mode) if isinstance(color_mode, str) else None
        compression = img_pil.info.get('compression', 'Unknown')

        # EXIF orientation: re-save a corrected, EXIF-free copy when needed
        orientation = get_exif_orientation(img_pil)
        if not (orientation is None or orientation == 1):
            exif_update_image_files(img_pil, orientation, filepath, os.path.join(output_dir, 'exif'))

        # fingerprints for duplicate detection
        md5_checksum = compute_md5(filepath)
        hex_string = compute_phash(filepath, hash_size=16)

        laplacian_var, fourier_energy, tenengrad_value, entropy_value, ssim_score = check_blur(img_opencv)

        log_entry = {
            'filename': filepath,
            'file_extension': file_extension,
            'pil_check': pil_result,
            'opencv_check': opencv_result,
            'header_check': header_result,
            'header_error': header_error,
            'file_size': file_size,
            'resolution': (width, height),
            'color_mode': color_mode,
            'bit_depth': bit_depth,
            'channels': channels,
            'compression': compression,
            'exif_data': orientation,
            'md5_checksum': md5_checksum,
            'phash16_checksum': hex_string,
            'laplacian_var': laplacian_var,
            'fourier_energy': fourier_energy,
            'tenengrad_value': tenengrad_value,
            'entropy_value': entropy_value,
            'ssim_score': ssim_score
        }
    else:
        log_entry = {
            'filename': filepath,
            'file_extension': file_extension,
            'pil_check': pil_result,
            'opencv_check': opencv_result,
            'header_check': header_result,
            'header_error': header_error,
        }
        # copy the broken file into the output directory for later inspection
        broken_dir = os.path.join(output_dir, 'broken')
        os.makedirs(broken_dir, exist_ok=True)
        shutil.copy(filepath, broken_dir)

    print(f"File: {filepath}")
    print(f"PIL check: {'OK' if pil_result else 'failed'}")
    print(f"OpenCV check: {'OK' if opencv_result else 'failed'}")
    print(f"Header check: {'OK' if header_result else 'failed'} - {header_error}")
    print("-" * 40)

    return log_entry


def write_to_csv(log_entries, output_path):
    fieldnames = [
        'filename', 'file_extension', 'pil_check', 'opencv_check', 'header_check', 'header_error',
        'file_size', 'resolution', 'color_mode', 'bit_depth', 'channels', 'compression', 'exif_data',
        'md5_checksum', 'phash16_checksum',
        'laplacian_var', 'fourier_energy', 'tenengrad_value', 'entropy_value', 'ssim_score'
    ]
    mode = 'a' if os.path.exists(output_path) else 'w'
    with open(output_path, mode, newline='', encoding='utf-8-sig') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if mode == 'w':
            writer.writeheader()
        for entry in log_entries:
            writer.writerow(entry)


def main(input_dir, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    output_csv_path = os.path.join(output_dir, 'image_integrity_report.csv')

    # walk the input directory recursively and collect image paths
    filepaths = []
    for root, dirs, fs in tqdm(os.walk(input_dir), desc='Scanning directories...'):
        filepaths.extend([
            os.path.join(root, f) for f in fs
            if f.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tiff'))
        ])
    print(f"Found {len(filepaths)} images to process.")

    # process images in a thread pool and write results to CSV in batches
    batch_size = 100  # rows buffered per CSV write
    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = {executor.submit(process_images, fp, output_dir): fp for fp in filepaths}
        processed_entries = []
        for future in tqdm(as_completed(futures), total=len(futures), desc='Writing CSV...'):
            try:
                log_entry = future.result()
                processed_entries.append(log_entry)
                if len(processed_entries) >= batch_size:
                    write_to_csv(processed_entries, output_csv_path)
                    processed_entries.clear()
            except Exception as exc:
                print(f'{futures[future]} generated an exception: {exc}')

    # flush the remaining rows
    if processed_entries:
        write_to_csv(processed_entries, output_csv_path)
    print("Report generated.")


if __name__ == "__main__":
    # example usage
    input_directory = "your_inputpath"
    output_directory = "your_outputpath"
    main(input_directory, output_directory)
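The feature list above mentions PSNR, but the script only computes SSIM (and only when a reference image is passed to check_blur). If you do have a pristine reference, say the original before re-encoding, here is a minimal sketch using OpenCV's built-in cv2.PSNR; the file names are placeholders and the dB rules of thumb are rough guides only.

import cv2

# hypothetical paths: a reference image and a re-encoded copy of it
ref = cv2.imread("reference.png")
test = cv2.imread("compressed.jpg")
assert ref is not None and test is not None and ref.shape == test.shape

# cv2.PSNR expects two images of identical shape; higher dB = closer to the reference
psnr_db = cv2.PSNR(ref, test)
print(f"PSNR: {psnr_db:.2f} dB")  # roughly, >40 dB is near-lossless, <30 dB is visibly degraded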
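Once image_integrity_report.csv exists, the actual cleaning decisions (dropping duplicates, flagging blurry files) can be made from the report rather than by re-reading every image. A minimal pandas sketch, assuming the column names produced by write_to_csv above; the laplacian_var threshold of 100 is a placeholder to tune per dataset.

import pandas as pd

df = pd.read_csv("your_outputpath/image_integrity_report.csv")

# exact duplicates: identical MD5 digests
dupes = df[df.duplicated(subset="md5_checksum", keep="first")]
print(f"{len(dupes)} exact duplicates")

# likely-blurry images: low variance of the Laplacian
# (100 is an assumed starting threshold -- inspect samples before deleting anything)
blurry = df[df["laplacian_var"] < 100]
print(f"{len(blurry)} images flagged as blurry")

dupes.to_csv("duplicates.csv", index=False)
blurry.to_csv("blurry.csv", index=False)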
That concludes this walkthrough of building an image data cleaning & image quality inspection tool in Python.