Merge pull request #80 from CCBR/issue78
issue78
kopardev authored Feb 14, 2024
2 parents a1b6b92 + e573f1f commit bb45277
Showing 7 changed files with 86 additions and 49 deletions.
11 changes: 11 additions & 0 deletions CHANGELOG.md
@@ -4,6 +4,17 @@

### Bug fixes

## spacesavers2 0.11.1

### New features

- `spacesavers2_e2e` now shows more defaults in `--help`

### Bug fixes

- grubbers do not include non-duplicate copies (aka hardlinks) (#78, @kopardev)
- blamematrix reports numbers correctly (minor bug fix)

## spacesavers2 0.11.0

### New features
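For context on the grubbers fix (#78): a hard link is just a second directory entry for the same inode, so deleting it frees no space, and it should not be counted as a reclaimable duplicate. A minimal stand-alone illustration (POSIX only; the temp-file names are hypothetical):

import os
import tempfile

d = tempfile.mkdtemp()
orig = os.path.join(d, "a.txt")
with open(orig, "w") as fh:
    fh.write("data")
link = os.path.join(d, "b.txt")
os.link(orig, link)  # hard link: a second name for the same inode

a, b = os.stat(orig), os.stat(link)
print((a.st_ino, a.st_dev) == (b.st_ino, b.st_dev))  # True -> deleting b.txt frees nothing
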
6 changes: 3 additions & 3 deletions spacesavers2_e2e
@@ -12,9 +12,9 @@ ARGPARSE_DESCRIPTION="End-to-end run of spacesavers2"
source ${SCRIPT_DIR}/resources/argparse.bash || exit 1
argparse "$@" <<EOF || exit 1
parser.add_argument('-f','--folder',required=True, help='Folder to run spacesavers_catalog on.')
parser.add_argument('-p','--threads',required=False, help='number of threads to use', default=4)
parser.add_argument('-d','--maxdepth',required=False, help='maxdepth for mimeo', default=4)
parser.add_argument('-l','--limit',required=False, help='limit for running spacesavers_grubbers', default=5)
parser.add_argument('-p','--threads',required=False, help='number of threads to use for catalog (default 4)', default=4)
parser.add_argument('-d','--maxdepth',required=False, help='maxdepth for mimeo (default 4)', default=4)
parser.add_argument('-l','--limit',required=False, help='limit for running spacesavers_grubbers (default 5)', default=5)
parser.add_argument('-q','--quota',required=False, help='total size of the volume (default = 200 for /data/CCBR)', default=200)
parser.add_argument('-o','--outfolder',required=True, help='Folder where all spacesavers_e2e output files will be saved')
EOF
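The change above simply spells each default out in its help string so that "--help" shows it. A stand-alone sketch of the same pattern in plain Python argparse (the script itself routes these lines through the argparse.bash wrapper):

import argparse

parser = argparse.ArgumentParser(description="End-to-end run of spacesavers2")
parser.add_argument('-p', '--threads', required=False, default=4,
                    help='number of threads to use for catalog (default 4)')
parser.add_argument('-d', '--maxdepth', required=False, default=4,
                    help='maxdepth for mimeo (default 4)')
parser.print_help()  # both defaults now appear in the help text
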
6 changes: 4 additions & 2 deletions spacesavers2_grubbers
@@ -80,12 +80,12 @@ def main():
properly_set = dfu.set(l)
if not properly_set: # could not read line properly or there are no duplicates
continue
if dfu.ndup == 0: continue # in case mimeo was run without -z
if dfu.ndup_uid_inode == 0: continue # in case mimeo was run without -z
dups.append(dfu)

dups.sort() # look at __lt__ ... it's sorting from highest to lowest totalsize
saved = 0
top_limit = args.limit * 1024 * 1024 * 1024 # 5 GiB
top_limit = args.limit * 1024 * 1024 * 1024 # 5 GiB default
if args.outfile:
of = open(args.outfile, "w")
else:
@@ -95,6 +95,8 @@
if fgitem.totalsize <= top_limit:
break
saved += fgitem.totalsize
outstr = str(fgitem)
if outstr == "": continue
of.write("%s\n"%(fgitem))

if args.outfile:
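Two details in this hunk are worth spelling out: args.limit is converted from GiB to bytes, and because dups.sort() orders items largest-first, the loop can stop at the first item at or below the cutoff. A self-contained sketch with made-up sizes:

from collections import namedtuple

FG = namedtuple("FG", "totalsize")
# hypothetical totalsize values in bytes, already sorted largest first
items = [FG(8 << 30), FG(6 << 30), FG(2 << 30)]

limit_gib = 5  # stands in for args.limit
top_limit = limit_gib * 1024 * 1024 * 1024  # GiB -> bytes; 5 GiB = 5368709120

saved = 0
for item in items:
    if item.totalsize <= top_limit:
        break  # sorted order: every later item is also below the cutoff
    saved += item.totalsize
print(saved)  # 15032385536, i.e. the 8 GiB and 6 GiB items
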
9 changes: 5 additions & 4 deletions spacesavers2_mimeo
@@ -56,7 +56,8 @@ def process_hh(
if foldest.uid == uid or 0 == uid : user_owns_original = True
uid_file_index = list(filter(lambda x:x!=oldest_index,uid_file_index)) # remove oldest if present in list
inodes_already_summerized = [foldest.inode]
if hashhash[h].ndup_files > 0: # we have duplicates
# if hashhash[h].ndup_files > 0: # we have duplicates
if len(uid_file_index) > 0: # uid has copies
for i in uid_file_index:
f = hashhash[h].flist[i]
fpath = f.apath
@@ -281,6 +282,7 @@ def main():
)
blamematrix = dict()
all_blamematrix_paths = set()
# users=[0] # debug only
for uid in users:
blamematrix[uid] = dict()
print_with_timestamp(
@@ -363,9 +365,8 @@ def main():
perfolder_summaries[p].update_scores(quota)
user_summary.write(f"{perfolder_summaries[p]}\n")
for p in perfolder_summaries:
dummy = FileDetails()
dummy.initialize(p)
if dummy.get_depth() == mindepth + 1:
p_depth = len(list(p.parents))
if p_depth == mindepth:
all_blamematrix_paths.add(p)
blamematrix[uid][p] = sum(perfolder_summaries[p].dup_Bytes)

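The rewritten depth test uses pathlib directly: the length of a path's parents sequence is its depth, which avoids building a throwaway FileDetails object just to call get_depth(). A quick illustration with a hypothetical path:

from pathlib import Path

p = Path("/data/CCBR/projects")
print(len(list(p.parents)))  # 3 -> parents are /data/CCBR, /data, and /
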
27 changes: 15 additions & 12 deletions src/FileDetails.py
@@ -28,18 +28,21 @@ def convert_time_to_age(t):

def get_type(p):
x = "u" # unknown
if not p.exists():
x = "a" # absent
return x
if p.is_symlink():
x = "l" # link or symlink
return x
if p.is_dir():
x = "d" # directory
return x
if p.is_file():
x = "f" # file
return x
try:
if not p.exists():
x = "a" # absent
return x
if p.is_symlink():
x = "l" # link or symlink
return x
if p.is_dir():
x = "d" # directory
return x
if p.is_file():
x = "f" # file
return x
except: # mainly to catch PermissionError:
sys.stderr.write("spacesavers2:File cannot be read:{}\n".format(p))
return x

class FileDetails:
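The new try/except matters because CPython's Path.is_dir()/is_file() only swallow missing-file errors; a PermissionError from stat() on a path under an unreadable directory propagates and would otherwise kill the walk. A minimal sketch of the same guard, narrowed here to OSError (of which PermissionError is a subclass); classify is a hypothetical stand-in for get_type:

import sys
from pathlib import Path

def classify(p: Path) -> str:
    try:
        if not p.exists():
            return "a"  # absent
        if p.is_symlink():
            return "l"  # link or symlink
        if p.is_dir():
            return "d"  # directory
        if p.is_file():
            return "f"  # file
    except OSError as e:  # mainly PermissionError
        sys.stderr.write("File cannot be read: {}: {}\n".format(p, e))
    return "u"  # unknown

print(classify(Path("/")))  # 'd'
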
2 changes: 1 addition & 1 deletion src/VERSION
@@ -1 +1 @@
0.11.0
0.11.1
74 changes: 47 additions & 27 deletions src/dfUnit.py
@@ -1,13 +1,6 @@
import sys
from .utils import *

def get_filename_from_fgzlistitem(string):
string = string.strip().split(";")[:-1]
for i in range(11):
dummy = string.pop(-1)
filename = ";".join(string)
return filename


class dfUnit:
def __init__(self,hash):
@@ -70,9 +63,9 @@ def compute(self,hashhashsplits):
if fd.size == size:
hashhashsplits[newhash].add_fd(fd)
else: # there is only 1 size ... no splits required
self.ndup = len(self.inode_list) - 1 #ndup is zero if same len(size_set)==1 and len(inode_list)==1
self.ndup = len(self.inode_list) - 1 # ndup is zero if same len(size_set)==1 and len(inode_list)==1
self.ndup_inode = len(set(self.inode_list)) - 1
self.ndup_files = len(self.inode_list) - 1
self.ndup_files = len(self.inode_list) - 1 # some duplicate files may be hard links
self.fsize = self.flist[0].calculated_size
return split_required

@@ -92,52 +85,79 @@ def __str__(self):
return "{0} : {1} {2} {3}".format(self.hash, self.ndup_inode, self.fsize,"##".join(map(lambda x:str(x),self.flist)))

def str_with_name(self,uid2uname, gid2gname,findex):
return "{0} : {1} {2} {3}".format(self.hash, self.ndup_inode, self.fsize,"##".join(map(lambda x:x.str_with_name(uid2uname,gid2gname),[self.flist[i] for i in findex])))
original_inode = self.flist[findex[0]].inode # first file is the original
uid_inodes = [self.flist[i].inode for i in findex[1:]] # 2nd file onwards could be duplicates (same content, different inode) or copies (same inode <-> hardlink)
dup_uid_inodes = list(filter(lambda x:x!=original_inode,uid_inodes))
ndup_uid_inodes = len(dup_uid_inodes)
return "{0} : {1} {2} {3} {4}".format(self.hash,
self.ndup_inode, # total number of duplicates (all users)
ndup_uid_inodes, # total number of duplicates (this uid)
self.fsize, # size of each duplicate
"##".join(map(lambda x:x.str_with_name(uid2uname,gid2gname),[self.flist[i] for i in findex])))

def _get_inode(s):
s = s.strip().split(";")
return(s[-8])

def _get_filename_from_fgzlistitem(string):
string = string.strip().split(";")[:-1]
for i in range(11):
dummy = string.pop(-1)
filename = ";".join(string)
return filename

class fgz: # used by grubber
def __init__(self):
self.hash = ""
self.ndup = -1 # number of duplicate files and not duplicate inodes
self.filesize = -1
self.totalsize = -1
self.fds = [] # list of duplicate files
self.of = "" # original file
self.ndup_inode = -1 # number of duplicate inodes (all users)
self.ndup_uid_inode = -1 # number of duplicate inodes (this uid)
self.filesize = -1 # size of each duplicate or copy
self.totalsize = -1 # total size of duplicates for this uid
self.fds = [] # list of duplicate files for this uid
self.of = "" # original file aka non-duplicate file

def __lt__(self,other):
return self.totalsize > other.totalsize

def __str__(self):
dup_fds = []
inodes_seen = dict()
inodes_seen[_get_inode(self.of)] = 1
for f in self.fds:
f_inode = _get_inode(f)
if f_inode in inodes_seen: continue
inodes_seen[f_inode] = 1
dup_fds.append(f)
outstring=[]
outstring.append(str(self.hash))
outstring.append(str(self.ndup))
outstring.append(str(self.ndup_uid_inode))
outstring.append(str(self.totalsize))
outstring.append(str(self.filesize))
outstring.append(get_filename_from_fgzlistitem(self.of))
outstring.append("##".join(map(lambda x:get_filename_from_fgzlistitem(x),self.fds)))
outstring.append(_get_filename_from_fgzlistitem(self.of))
outstring.append("##".join(map(lambda x:_get_filename_from_fgzlistitem(x),dup_fds)))
return "\t".join(outstring)
# return "{0} {1} {2} {3} {4}".format(self.hash,self.ndup,get_human_readable_size(self.totalsize), get_human_readable_size(self.filesize), ";".join(map(lambda x:get_filename_from_fgzlistitem(x),self.fds)))
# return "{0} {1} {2} {3} {4}".format(self.hash,self.ndup,self.totalsize, self.filesize, ";".join(map(lambda x:get_filename_from_fgzlistitem(x),self.fds)))
# return "{0} {1} {2} {3} {4}".format(self.hash,self.ndup,get_human_readable_size(self.totalsize), get_human_readable_size(self.filesize), ";".join(map(lambda x:_get_filename_from_fgzlistitem(x),self.fds)))
# return "{0} {1} {2} {3} {4}".format(self.hash,self.ndup,self.totalsize, self.filesize, ";".join(map(lambda x:_get_filename_from_fgzlistitem(x),self.fds)))

# 09f9599cff76f6c82a96b042d67f81ff#09f9599cff76f6c82a96b042d67f81ff : 158 1348 "/data/CCBR/projects/ccbr583/Pipeliner/.git/hooks/pre-push.sample";1348;41;8081532070425347857;1;1552;35069;57786;jailwalapa;CCBR;##"/data/CCBR/projects/ccbr785/FREEC/.git/hooks/pre-push.sample";1348;41;11610558684702129747;1;1629;35069;57786;jailwalapa;CCBR;##"/data/CCBR/projects/ccbr785/citup/pypeliner/.git/hooks/pre-push.sample";1348;41;9306919632329364056;1;1624;35069;57786;jailwalapa;CCBR;##"/data/CCBR/projects/ccbr785/titan_workflow/.git/hooks/pre-push.sample";1348;41;7658100918611057517;1;1628;35069;57786;jailwalapa;CCBR;##"/data/CCBR/rawdata/ccbr1016/batch1/fastq/scratch/example/Pipeliner/.git/hooks/pre-push.sample";1348;41;328973360624494807;1;1253;35069;57786;jailwalapa;CCBR;##"/data/CCBR/rawdata/ccbr1040/Seq2n3n4n5_GEXnHTO/Pipeliner/.git/hooks/pre-push.sample";1348;41;16190385205193530167;1;1093;35069;57786;jailwalapa;CCBR;##"/data/CCBR/rawdata/ccbr1044/Pipeliner/.git/hooks/pre-push.sample";1348;41;10429578581567757002;1;1110;35069;57786;jailwalapa;CCBR;
def set(self,inputline):
original_line = inputline
try:
inputline = inputline.strip().split(" ")
if len(inputline) < 5:
raise Exception("Less than 5 items in mimeo.files.gz line.")
if len(inputline) < 6:
raise Exception("Less than 6 items in mimeo.files.gz line.")
self.hash = inputline.pop(0)
dummy = inputline.pop(0) # the colon
total_ndup = int(inputline.pop(0))
if total_ndup == 0: # maybe mimeo was run to output all files .. not just dups .. aka without the -z option
self.ndup_inode = int(inputline.pop(0))
self.ndup_uid_inode = int(inputline.pop(0))
if self.ndup_uid_inode == 0: # maybe mimeo was run to output all files .. not just dups .. aka without the -z option
return False
self.filesize = int(inputline.pop(0))
full_fds = " ".join(inputline) # because file names can contain spaces
fds = full_fds.split("##")
self.ndup = total_ndup # this user's number of duplicate files
self.of = fds.pop(0) # one file is the original
self.fds = fds # others are duplicates
self.totalsize = total_ndup * self.filesize
self.fds = fds # other duplicates or copies
self.totalsize = self.ndup_uid_inode * self.filesize # total size of all duplicates for this uid
return True
except:
sys.stderr.write("spacesavers2:{0}:files.gz Do not understand line:{1} with {2} elements.\n".format(self.__class__.__name__,original_line,len(inputline)))
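For reference, each record in mimeo.files.gz (see the sample line embedded above) is a hash, the counts, the per-copy size, and then "##"-separated file entries whose fields are semicolon-delimited, with the inode sitting eight fields from the end, which is what _get_inode() pulls out. A small sketch of the inode-based de-duplication that __str__() now performs, using two hypothetical entries that hard-link the same inode:

def get_inode(entry):
    # semicolon-delimited entry; the inode is the 8th field from the end
    return entry.strip().split(";")[-8]

entries = [
    '"/data/x/a.txt";1348;41;111;1;1552;35069;57786;user;CCBR;',
    '"/data/x/b.txt";1348;41;111;1;1552;35069;57786;user;CCBR;',  # same inode 111
]
seen, dup_fds = set(), []
for e in entries:
    ino = get_inode(e)
    if ino in seen:
        continue  # hard link to an inode already counted
    seen.add(ino)
    dup_fds.append(e)
print(len(dup_fds))  # 1 -> the hard-linked copy is not reported again
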
