diff --git a/CHANGELOG.md b/CHANGELOG.md index b65cf73..703f69b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,17 @@ ### Bug fixes +## spacesavers2 0.11.1 + +### New features + +- `spacesavers2_e2e` now shows more defaults in "--help" + +### Bug fixes + +- grubbers do not include non-duplicate copies (aka hardlinks) (#78, @kopardev) +- blamematrix reports numbers correctly (minor bug fix) + ## spacesavers2 0.11.0 ### New features diff --git a/spacesavers2_e2e b/spacesavers2_e2e index 57d2bd2..3b18625 100755 --- a/spacesavers2_e2e +++ b/spacesavers2_e2e @@ -12,9 +12,9 @@ ARGPARSE_DESCRIPTION="End-to-end run of spacesavers2" source ${SCRIPT_DIR}/resources/argparse.bash || exit 1 argparse "$@" < 0: # we have duplicates + # if hashhash[h].ndup_files > 0: # we have duplicates + if len(uid_file_index) > 0: # uid has copies for i in uid_file_index: f = hashhash[h].flist[i] fpath = f.apath @@ -281,6 +282,7 @@ def main(): ) blamematrix = dict() all_blamematrix_paths = set() + # users=[0] # debug only for uid in users: blamematrix[uid] = dict() print_with_timestamp( @@ -363,9 +365,8 @@ def main(): perfolder_summaries[p].update_scores(quota) user_summary.write(f"{perfolder_summaries[p]}\n") for p in perfolder_summaries: - dummy = FileDetails() - dummy.initialize(p) - if dummy.get_depth() == mindepth + 1: + p_depth = len(list(p.parents)) + if p_depth == mindepth: all_blamematrix_paths.add(p) blamematrix[uid][p] = sum(perfolder_summaries[p].dup_Bytes) diff --git a/src/FileDetails.py b/src/FileDetails.py index ce7a4db..1b5a62c 100644 --- a/src/FileDetails.py +++ b/src/FileDetails.py @@ -28,18 +28,21 @@ def convert_time_to_age(t): def get_type(p): x = "u" # unknown - if not p.exists(): - x = "a" # absent - return x - if p.is_symlink(): - x = "l" # link or symlink - return x - if p.is_dir(): - x = "d" # directory - return x - if p.is_file(): - x = "f" # file - return x + try: + if not p.exists(): + x = "a" # absent + return x + if p.is_symlink(): + x = "l" # link or 
symlink + return x + if p.is_dir(): + x = "d" # directory + return x + if p.is_file(): + x = "f" # file + return x + except: # mainly to catch PermissionError: + sys.stderr.write("spacesavers2:File cannot be read:{}\n".format(p)) return x class FileDetails: diff --git a/src/VERSION b/src/VERSION index d9df1bb..af88ba8 100644 --- a/src/VERSION +++ b/src/VERSION @@ -1 +1 @@ -0.11.0 +0.11.1 diff --git a/src/dfUnit.py b/src/dfUnit.py index 73c9e1b..fd3fb67 100644 --- a/src/dfUnit.py +++ b/src/dfUnit.py @@ -1,13 +1,6 @@ import sys from .utils import * -def get_filename_from_fgzlistitem(string): - string = string.strip().split(";")[:-1] - for i in range(11): - dummy = string.pop(-1) - filename = ";".join(string) - return filename - class dfUnit: def __init__(self,hash): @@ -70,9 +63,9 @@ def compute(self,hashhashsplits): if fd.size == size: hashhashsplits[newhash].add_fd(fd) else: # there only 1 size ... no splits required - self.ndup = len(self.inode_list) - 1 #ndup is zero if same len(size_set)==1 and len(inode_list)==1 + self.ndup = len(self.inode_list) - 1 # ndup is zero if same len(size_set)==1 and len(inode_list)==1 self.ndup_inode = len(set(self.inode_list)) - 1 - self.ndup_files = len(self.inode_list) - 1 + self.ndup_files = len(self.inode_list) - 1 # some duplicate files may be hard links self.fsize = self.flist[0].calculated_size return split_required @@ -92,52 +85,79 @@ def __str__(self): return "{0} : {1} {2} {3}".format(self.hash, self.ndup_inode, self.fsize,"##".join(map(lambda x:str(x),self.flist))) def str_with_name(self,uid2uname, gid2gname,findex): - return "{0} : {1} {2} {3}".format(self.hash, self.ndup_inode, self.fsize,"##".join(map(lambda x:x.str_with_name(uid2uname,gid2gname),[self.flist[i] for i in findex]))) + original_inode = self.flist[findex[0]].inode # first file is the original + uid_inodes = [self.flist[i].inode for i in findex[1:]] # 2nd file onwards could be duplicates (same file but different inode) or copy (same inode <-> hardlink) + 
dup_uid_inodes = list(filter(lambda x:x!=original_inode,uid_inodes)) + ndup_uid_inodes = len(dup_uid_inodes) + return "{0} : {1} {2} {3} {4}".format(self.hash, + self.ndup_inode, # total number of duplicates (all users) + ndup_uid_inodes, # total number of duplicates (this uid) + self.fsize, # size of each duplicate + "##".join(map(lambda x:x.str_with_name(uid2uname,gid2gname),[self.flist[i] for i in findex]))) +def _get_inode(s): + s = s.strip().split(";") + return(s[-8]) + +def _get_filename_from_fgzlistitem(string): + string = string.strip().split(";")[:-1] + for i in range(11): + dummy = string.pop(-1) + filename = ";".join(string) + return filename class fgz: # used by grubber def __init__(self): self.hash = "" - self.ndup = -1 # number of duplicate files and not duplicate inodes - self.filesize = -1 - self.totalsize = -1 - self.fds = [] # list of duplicate files - self.of = "" # original file + self.ndup_inode = -1 # number of duplicate inodes (all users) + self.ndup_uid_inode = -1 # number of duplicate inodes (this uid) + self.filesize = -1 # size of each duplicate or copy + self.totalsize = -1 # total size of duplicates for this uid + self.fds = [] # list of duplicate files for this uid + self.of = "" # original file aka non-duplicate file def __lt__(self,other): return self.totalsize > other.totalsize def __str__(self): + dup_fds = [] + inodes_seen = dict() + inodes_seen[_get_inode(self.of)] = 1 + for f in self.fds: + f_inode = _get_inode(f) + if f_inode in inodes_seen: continue + inodes_seen[f_inode] = 1 + dup_fds.append(f) outstring=[] outstring.append(str(self.hash)) - outstring.append(str(self.ndup)) + outstring.append(str(self.ndup_uid_inode)) outstring.append(str(self.totalsize)) outstring.append(str(self.filesize)) - outstring.append(get_filename_from_fgzlistitem(self.of)) - outstring.append("##".join(map(lambda x:get_filename_from_fgzlistitem(x),self.fds))) + outstring.append(_get_filename_from_fgzlistitem(self.of)) + 
outstring.append("##".join(map(lambda x:_get_filename_from_fgzlistitem(x),dup_fds))) return "\t".join(outstring) - # return "{0} {1} {2} {3} {4}".format(self.hash,self.ndup,get_human_readable_size(self.totalsize), get_human_readable_size(self.filesize), ";".join(map(lambda x:get_filename_from_fgzlistitem(x),self.fds))) - # return "{0} {1} {2} {3} {4}".format(self.hash,self.ndup,self.totalsize, self.filesize, ";".join(map(lambda x:get_filename_from_fgzlistitem(x),self.fds))) + # return "{0} {1} {2} {3} {4}".format(self.hash,self.ndup,get_human_readable_size(self.totalsize), get_human_readable_size(self.filesize), ";".join(map(lambda x:_get_filename_from_fgzlistitem(x),self.fds))) + # return "{0} {1} {2} {3} {4}".format(self.hash,self.ndup,self.totalsize, self.filesize, ";".join(map(lambda x:_get_filename_from_fgzlistitem(x),self.fds))) # 09f9599cff76f6c82a96b042d67f81ff#09f9599cff76f6c82a96b042d67f81ff : 158 1348 "/data/CCBR/projects/ccbr583/Pipeliner/.git/hooks/pre-push.sample";1348;41;8081532070425347857;1;1552;35069;57786;jailwalapa;CCBR;##"/data/CCBR/projects/ccbr785/FREEC/.git/hooks/pre-push.sample";1348;41;11610558684702129747;1;1629;35069;57786;jailwalapa;CCBR;##"/data/CCBR/projects/ccbr785/citup/pypeliner/.git/hooks/pre-push.sample";1348;41;9306919632329364056;1;1624;35069;57786;jailwalapa;CCBR;##"/data/CCBR/projects/ccbr785/titan_workflow/.git/hooks/pre-push.sample";1348;41;7658100918611057517;1;1628;35069;57786;jailwalapa;CCBR;##"/data/CCBR/rawdata/ccbr1016/batch1/fastq/scratch/example/Pipeliner/.git/hooks/pre-push.sample";1348;41;328973360624494807;1;1253;35069;57786;jailwalapa;CCBR;##"/data/CCBR/rawdata/ccbr1040/Seq2n3n4n5_GEXnHTO/Pipeliner/.git/hooks/pre-push.sample";1348;41;16190385205193530167;1;1093;35069;57786;jailwalapa;CCBR;##"/data/CCBR/rawdata/ccbr1044/Pipeliner/.git/hooks/pre-push.sample";1348;41;10429578581567757002;1;1110;35069;57786;jailwalapa;CCBR; def set(self,inputline): original_line = inputline try: inputline = inputline.strip().split(" 
") - if len(inputline) < 5: - raise Exception("Less than 5 items in mimeo.files.gz line.") + if len(inputline) < 6: + raise Exception("Less than 6 items in mimeo.files.gz line.") self.hash = inputline.pop(0) dummy = inputline.pop(0) # the colon - total_ndup = int(inputline.pop(0)) - if total_ndup == 0: # may be mimeo was run to output all files .. not just dups .. aka without the -z option + self.ndup_inode = int(inputline.pop(0)) + self.ndup_uid_inode = int(inputline.pop(0)) + if self.ndup_uid_inode == 0: # may be mimeo was run to output all files .. not just dups .. aka without the -z option return False self.filesize = int(inputline.pop(0)) full_fds = " ".join(inputline) # bcos file names can contain spaces fds = full_fds.split("##") - self.ndup = total_ndup # these are user number of duplicates/files self.of = fds.pop(0) # one file is the original - self.fds = fds # others are dupicates - self.totalsize = total_ndup * self.filesize + self.fds = fds # other duplicates or copies + self.totalsize = self.ndup_uid_inode * self.filesize # total size of all duplicates for this uid return True except: sys.stderr.write("spacesavers2:{0}:files.gz Do not understand line:{1} with {2} elements.\n".format(self.__class__.__name__,original_line,len(inputline)))