快捷導航

使用Python實現文件查重功能

更新時間：2024年12月30日 11:29:52 作者：hvinsion

這篇文章主要為大家詳細介紹了Python如何通過循環檢測來刪除重復文件，從而實現文件查重功能。文中的示例代碼講解詳細，需要的小伙伴可以參考下。

1.簡介

這是一個用Python編寫的文件去重工具。市面上很多檢測重復文件的軟件都要付費或者需要破解，於是就想著能不能自己做一個可以在後臺隨時自動去重的工具。雖然市面上已有很多檢測重復文件的軟件，但是這個工具的使用環境和那些工具很不一樣：別的工具刪除重復文件時，都是先掃描、再由用戶自己刪除，需要人工值守；這個工具的掃描和刪除不需要人工值守，並且可以在後臺運行。設置好之後，就可以吃著火鍋唱著歌幹別的去了，只要電腦不斷電就可以一直運行。別的軟件必須等掃描完才能進行下一步操作，這是本質區別。

2.工具功能

1 通過檢測MD5來判斷是否有文件重復，檢測過後可以自行選擇刪除或者不刪除，以及是移動到回收站還是直接刪除。

2 循環檢測指定目錄的重復文件，只要新生成了重復文件，就會立刻被刪除。

3 刪除默認會保留最早的文件。

**注：**查重依據的是文件內容（文件大小與MD5），文件名或文件屬性不一樣的文件，只要內容相同也能查重成功。

3.運行效果

4.相關源碼

import sys, os, filecmp
import time
from win32com.shell import shell,shellcon



def deltorecyclebin(filename):
    """Delete ``filename`` through the Windows shell so it lands in the Recycle Bin.

    The request is issued silently and without a confirmation dialog, with
    the undo flag set so the shell recycles the file instead of erasing it.

    Args:
        filename: Path of the file to recycle.

    Returns:
        None. The shell's result is not inspected, so a failed operation is
        not reported to the caller.
    """
    flags = (
        shellcon.FOF_SILENT
        | shellcon.FOF_ALLOWUNDO
        | shellcon.FOF_NOCONFIRMATION
    )
    # Same 7-item operation tuple the shell binding expects: window handle,
    # operation code, source, destination, flags, and two unused slots.
    operation = (0, shellcon.FO_DELETE, filename, None, flags, None, None)
    shell.SHFileOperation(operation)

# Duplicate-file cleanup
def md5(fname, chunk_size=1024):
    """Return the hexadecimal MD5 digest of the file at ``fname``.

    The file is opened in binary mode and fed to the hasher in pieces of
    ``chunk_size`` bytes, so it is never loaded into memory in one go.

    Args:
        fname: Path of the file to hash.
        chunk_size: Number of bytes requested per read.

    Returns:
        The digest as a lowercase hex string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    import hashlib

    digest = hashlib.md5()
    with open(fname, "rb") as stream:
        # iter() with a sentinel keeps reading until read() returns b"".
        for piece in iter(lambda: stream.read(chunk_size), b""):
            digest.update(piece)
    return digest.hexdigest()

def _index_files_by_size(directories):
    """Walk ``directories`` and map each non-zero byte size to the paths of that size."""
    by_size = {}
    visited = set()
    for top in directories:
        for root, _dirs, names in os.walk(top, topdown=True, onerror=None, followlinks=False):
            for name in names:
                path = os.path.join(root, name)
                # Repeated or overlapping input directories reach the same path
                # more than once; without this guard a file would be compared
                # with itself and removed as its own "duplicate".
                identity = os.path.normcase(os.path.abspath(path))
                if identity in visited:
                    continue
                visited.add(identity)
                try:
                    size = os.stat(path).st_size
                except OSError:
                    # Vanished or unreadable since it was listed: skip it.
                    continue
                if size:  # empty files are never reported as duplicates
                    by_size.setdefault(size, []).append(path)
    return by_size


def _group_by_digest(by_size):
    """Return lists of paths that share both a size and an MD5 digest."""
    by_digest = {}
    for size, paths in by_size.items():
        if len(paths) < 2:
            continue  # a unique size cannot have a duplicate
        for path in paths:
            try:
                digest = md5(path)
            except OSError:
                # Unreadable files are left out of the comparison.
                continue
            by_digest.setdefault((size, digest), []).append(path)
    return [paths for paths in by_digest.values() if len(paths) > 1]


def _confirm_duplicate_groups(candidates):
    """Byte-compare hash-equal candidates and return groups of two or more identical files."""
    groups = []
    for paths in candidates:
        stamped = []
        for path in paths:
            try:
                stamped.append((os.path.getctime(path), path))
            except OSError:
                continue  # gone since it was hashed
        # Oldest first, so the first entry of every group is the one to keep.
        # Sorting on the timestamp only keeps the walk order for ties.
        stamped.sort(key=lambda item: item[0])
        remaining = [path for _ctime, path in stamped]

        while remaining:
            first = remaining[0]
            same = [first]
            different = []
            for other in remaining[1:]:
                try:
                    identical = filecmp.cmp(first, other, shallow=False)
                except OSError:
                    identical = False
                (same if identical else different).append(other)
            # A file with no byte-identical partner is not a duplicate group.
            if len(same) > 1:
                groups.append(same)
            remaining = different
    return groups


def check_for_duplicates(dufilelist):
    """Find duplicate files under the given directories and report or remove them.

    Files are narrowed down by size, then by MD5 digest, and finally confirmed
    with a byte-for-byte comparison. Within each group of identical files the
    one with the smallest ``os.path.getctime`` value is kept; the rest are the
    duplicates.

    What happens to the duplicates is controlled by two module-level globals
    that the caller must define before calling:

    * ``onlyprint`` true: list every group, then ask on stdin whether to
      delete ("d"), recycle ("r") or leave the duplicates.
    * ``onlyprint`` false and ``filedelete`` true: delete with ``os.remove``.
    * ``onlyprint`` false and ``filedelete`` false: hand each duplicate to
      ``deltorecyclebin``.

    Args:
        dufilelist: Iterable of directory paths to scan recursively. A path
            reached through more than one entry is only considered once.

    Returns:
        None. Results are printed and applied to the file system.

    Raises:
        OSError: If removing a duplicate with ``os.remove`` fails.
    """
    duplicates = _confirm_duplicate_groups(
        _group_by_digest(_index_files_by_size(dufilelist))
    )

    # Duplicates collected in report-only mode, awaiting the user's decision.
    groupready = []
    for index, group in enumerate(duplicates):
        print("第 " + str(int(index) + 1) + " 組")
        if onlyprint:
            for path in group:
                print("發(fā)現(xiàn)相同文件:  %r" % (path))
            groupready.extend(group[1:])
        elif filedelete:
            for path in group[1:]:
                os.remove(path)
                print("直接刪除重復(fù)文件%r" % (path))
        else:
            for path in group[1:]:
                deltorecyclebin(path)
                print("回收重復(fù)文件%r" % (path))

    if not duplicates:
        print("目錄里沒有找到重復(fù)文件")

    if groupready:
        print("--------------------------------分割線------------------------------------------")
        print("下面列出重復(fù)的文件:")
        for path in groupready:
            print(path)

        reback = input("輸入d直接刪除以上的重復(fù)文件, ,輸入r將以上重復(fù)文件放入回收站,,取消請任意輸入n或者關(guān)閉本窗口.請輸入: d/r/n:")
        if reback == "d":
            for path in groupready:
                os.remove(path)
                print("直接刪除重復(fù)文件%r" % (path))
        elif reback == "r":
            for path in groupready:
                deltorecyclebin(path)
                print("回收重復(fù)文件%r" % (path))
        else:
            print("取消操作")







if __name__ == "__main__":
    # Interactive entry point. ``onlyprint`` and ``filedelete`` are assigned at
    # module level on purpose: check_for_duplicates() reads them as globals.
    # Answers are trimmed and lower-cased, and every destructive choice needs
    # an explicit "n"; any other input selects the non-destructive option.
    loop_forever = input("是否循環(huán)檢測重復(fù)文件? y/n:").strip().lower() == "y"

    onlyprint = input("只檢測重復(fù)文件請輸入y,檢測并且自動刪除重復(fù)文件請輸入n y/n:").strip().lower() != "n"

    # Always defined, so the name exists even in report-only mode.
    filedelete = False
    if not onlyprint:
        # True means permanent deletion instead of the Recycle Bin.
        filedelete = input("重復(fù)文件放入回收站請輸入y ,直接刪除重復(fù)文件請輸入n y/n:").strip().lower() == "n"

    # Drop surrounding whitespace and quotes, as left by a pasted or dragged path.
    filepath = input("請輸入要檢測重復(fù)文件的目錄:").strip().strip('"').strip("'")

    time_start = time.time()

    while True:
        print("程序開始...請等待一會...我正在努力為主人干活中o(′^｀)o...")
        check_for_duplicates([filepath])

        # Elapsed time is measured from program start, so it accumulates
        # across passes in loop mode.
        seconds = time.time() - time_start
        m, s = divmod(seconds, 60)
        h, m = divmod(m, 60)
        print('當(dāng)前程序總共用時:%02d:%02d:%02d' % (h, m, s))

        if not loop_forever:
            break
        # Pause between passes when scanning continuously.
        time.sleep(3)

    input("程序結(jié)束,按回車退出")

到此這篇關於使用Python實現文件查重功能的文章就介紹到這了，更多相關Python文件查重內容請搜索腳本之家以前的文章或繼續瀏覽下面的相關文章，希望大家以後多多支持腳本之家！

您可能感興趣的文章: