-
Notifications
You must be signed in to change notification settings - Fork 0
/
hash-and-delete.py
210 lines (170 loc) · 7.56 KB
/
hash-and-delete.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
# Written by Richard Kirchofer.
# How to hash a file.
# http://www.pythoncentral.io/finding-duplicate-files-with-python/
# A function to convert bytes into something more readable.
# http://stackoverflow.com/questions/1094841/reusable-library-to-get-human-readable-version-of-file-size
import os
import sys
import hashlib
# os.makedirs(dir) to make a dir
# os.stat(path).st_size for file size
# os.remove(path)
# os.rename(path)
# hashfile source
# http://www.pythoncentral.io/finding-duplicate-files-with-python/
# path <str>, blocksize <int>
# path <str>, blocksize <int>
def hashfile(path, blocksize=65536):
    """Return the md5 hex digest of the file at *path*.

    Reads the file in chunks of *blocksize* bytes so arbitrarily large
    files can be hashed without loading them entirely into memory.
    """
    hasher = hashlib.md5()
    # 'with' guarantees the handle is closed even if read() raises;
    # the original open()/close() pair leaked the handle on error.
    with open(path, 'rb') as infile:
        buf = infile.read(blocksize)
        while buf:
            hasher.update(buf)
            buf = infile.read(blocksize)
    return hasher.hexdigest()
# http://stackoverflow.com/questions/1094841/reusable-library-to-get-human-readable-version-of-file-size
def sizeof_fmt(num, suffix='B'):
    """Format *num* as a human-readable size using binary (1024) prefixes."""
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    value = num
    for prefix in prefixes:
        if abs(value) < 1024.0:
            return "%3.1f%s%s" % (value, prefix, suffix)
        value /= 1024.0
    # Exhausted every listed prefix: report in yobi-units.
    return "%.1f%s%s" % (value, 'Yi', suffix)
#***********code above here can be considered golden
# returns a string that is the dir to use
# http://www.tutorialspoint.com/python/os_access.htm
# F_OK to test the existence of a path
# R_OK, W_OK, X_OK; read, write, execute
def choose_root_directory():
    """Prompt the user for the directory to scan; default is "./".

    Returns the entered path when it exists (read access is only warned
    about, not enforced), or when its parent directory is writable.
    NOTE(review): on the remaining branch ("error, invalid path") the
    function falls off the end and implicitly returns None — callers
    should confirm they handle that.
    """
    print "default dir is current dir (./)"
    path = str(raw_input("enter alternate dir: "))
    if path:
        # F_OK: does the path exist at all?
        if os.access(path, os.F_OK):
            print "path exists"
            if not os.access(path, os.R_OK):
                print "test for read privileges failed"
            # if not os.access(path, os.W_OK):
            # print "test for write privileges failed"
            # print "write privaleges not needed"
            # if not os.access(path, os.X_OK):
            # print "test for execute privileges failed"
            # print "execute privileges not needed"
            return path
        # os.access (returns true if access is allowed and false if it is not)
        # NOTE(review): for a bare name with no directory component,
        # dirname() is "" and os.access("", W_OK) is presumably False,
        # so such paths likely fall through to the error branch — confirm.
        elif os.access(os.path.dirname(str(path )), os.W_OK):
            print "path does not exist but write privileges are given"
            return str(path)
        else:
            print "error, invalid path"
            print "must have write privileges"
    else:
        # Empty input: take the default.
        print "using default dir (./)"
        return "./"
# we want to make sure that the input is valid (not out of range)
# also, if they choose to delete all then make sure they really intend to delete all
# uses: check_inputs_in_range()
# input_variable_type <str>, length <int>
def duplicate_check_input_dialogue(length):
    """Prompt for the indices (0..length-1) of the files to delete.

    Loops until the input is a valid space-separated list of in-range
    integers. Returns [] when the user enters nothing (skip), otherwise
    the list of chosen indices. Selecting all *length* entries requires
    an extra "y" confirmation.
    NOTE(review): a non-numeric token makes int(x) raise ValueError and
    crashes the prompt loop rather than re-asking.
    """
    while True:
        ones_to_delete = [int(x) for x in raw_input("Enter the numbers of the ones that you want to mark for deletion separated by spaces.\n").strip().split()]
        if not ones_to_delete:
            print "You didn't enter anything. Skipping..."
            return []
        if not check_inputs_in_range(ones_to_delete, length):
            print "Your input was invalid."
            continue
        # Deleting every copy loses the file entirely, so double-check.
        if len(ones_to_delete) == length:
            are_you_sure = raw_input("You have selected to delete all of the files. Are you sure you want to do this?\n (y/n) ")
            # print type(are_you_sure)
            if are_you_sure != "y":
                continue
        return ones_to_delete
# returns false if any of the inputs were invalid
# returns False if any of the inputs were invalid
def check_inputs_in_range(ones_to_delete, length):
    """Validate that every index in *ones_to_delete* is in [0, length-1].

    Prints a message describing the first offending value and returns
    False; returns True when every index is in range (including for an
    empty list).
    """
    for x in ones_to_delete:
        if int(x) > int(length) - 1:
            # Single-argument print() behaves identically in py2 and py3;
            # %-formatting also removes the stray space before the period
            # that the old comma-separated print produced.
            print("Your inputs must be less than %d." % int(length))
            return False
        elif int(x) < 0:
            # 0 is a valid index, so only strictly negative values fail
            # (the old message wrongly said "greater than 0").
            print("Your inputs must not be negative.")
            return False
    return True
def print_list_of_dir_tups(list_of_dirs_tups):
    """Print one numbered line per (dir, filename) tuple with its size.

    The printed index is exactly what duplicate_check_input_dialogue
    asks the user to type, so numbering starts at 0.

    The original body computed values but never printed anything (its
    print was commented out, and it would have crashed on
    os.stat(fname) after fname was split into a list) — the user was
    shown nothing to choose from.
    """
    for index, item in enumerate(list_of_dirs_tups):
        path = os.path.join(item[0], item[1])
        # Single-argument print() works identically in py2 and py3.
        print("%d\t%s\t%d" % (index, path, os.stat(path).st_size))
if __name__ == '__main__':
    root_directory = choose_root_directory()
    # md5 hex digest -> (dir, filename) of the first file seen with that hash
    all_hashes_once = {}
    # md5 hex digest -> list of (dir, filename) tuples, seeded with the
    # original copy from all_hashes_once plus every later duplicate
    all_duplicates = {}
    duplicate_size = 0   # total bytes occupied by redundant copies
    duplicate_count = 0  # number of redundant copies found
    #########################################
    # walk through the dirs, hash the files #
    #########################################
    # os.walk will return a tuple
    # you may modify os.walk()[1] to change which dirs it descends into
    for current_dir, dirs_in_current_dir, files_in_current_dir in os.walk(root_directory):
        print "checking", current_dir
        # exclude dirs that begin with a period (slice assignment mutates
        # the list in place, which is what makes os.walk skip them)
        dirs_in_current_dir[:] = [x for x in dirs_in_current_dir if not x[0] == "."]
        # hash the paths and add it to all_hashes_once if it's new or to all_duplicates if it's a duplicate
        # the files are stored as tuples of the dir and file name
        for file_name in files_in_current_dir:
            path = os.path.join(current_dir, file_name)
            file_hash = hashfile(path)
            if file_hash in all_hashes_once:
                size = int(os.stat(path).st_size)
                duplicate_size += size
                duplicate_count += 1
                print ">", sizeof_fmt(size), "\t", file_name
                # add the thing to the right list
                if file_hash in all_duplicates:
                    all_duplicates[file_hash].append((current_dir, file_name))
                else:
                    # first duplicate for this hash: seed the list with the
                    # original copy recorded in all_hashes_once
                    all_duplicates[file_hash] = [all_hashes_once[file_hash], (current_dir, file_name)]
            else:
                all_hashes_once[file_hash] = (current_dir, file_name)
    print "Found", duplicate_count, "duplicate file(s). Duplicate space", sizeof_fmt(duplicate_size)
    ##################################################################
    # print each set of duplicate files, choose which ones to delete #
    ##################################################################
    if not all_duplicates:
        print "No duplicate files found."
    else:
        # for each duplicate file
        for hash_value in all_duplicates:
            files_to_delete = []
            print "\nduplicate hash", hash_value
            duplicate_file_list = all_duplicates[hash_value]
            print_list_of_dir_tups(duplicate_file_list)
            indicies_for_deletion = duplicate_check_input_dialogue(len(duplicate_file_list))
            if not indicies_for_deletion:
                # empty selection means "skip this set"
                continue
            indicies_for_deletion.sort()
            # walk the list once, popping matching sorted indices as we go
            for index in range(len(duplicate_file_list)):
                if indicies_for_deletion and index == indicies_for_deletion[0]:
                    files_to_delete.append(duplicate_file_list[indicies_for_deletion.pop(0)])
            print "\nThis will keep:"
            for x in duplicate_file_list:
                if x not in files_to_delete:
                    print x
            print "\nThis will delete:"
            for x in files_to_delete:
                print x
            print "Is this correct?"
            response = raw_input("[return] to accept (anything else to skip)")
            if response:
                print "skip"
            else:
                for x in files_to_delete:
                    print "deleting", x
                    # NOTE(review): there is no os.remove(x) call here, so
                    # nothing is actually deleted — confirm whether this is
                    # an intentional dry run or a missing line
            # NOTE(review): source indentation was lost; this pause most
            # plausibly ends each duplicate-set iteration — confirm
            prompt = raw_input("done")