Merge pull request #80 from CCBR/issue78
issue78
kopardev authored Feb 14, 2024
2 parents a1b6b92 + e573f1f commit bb45277
Showing 7 changed files with 86 additions and 49 deletions.
11 changes: 11 additions & 0 deletions CHANGELOG.md
@@ -4,6 +4,17 @@

### Bug fixes

## spacesavers2 0.11.1

### New features

- `spacesavers2_e2e` now shows more defaults in `--help`

### Bug fixes

- grubbers do not include non-duplicate copies (aka hardlinks) (#78, @kopardev)
- blamematrix reports numbers correctly (minor bug fix)

## spacesavers2 0.11.0

### New features
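For context on the grubbers fix (#78): a hard link is just a second directory entry for the same inode, so deleting it frees no space, and it should not be counted as a reclaimable duplicate. A minimal stand-alone illustration (POSIX only; the temp-file names are hypothetical):

import os
import tempfile

d = tempfile.mkdtemp()
orig = os.path.join(d, "a.txt")
with open(orig, "w") as fh:
    fh.write("data")
link = os.path.join(d, "b.txt")
os.link(orig, link)  # hard link: a second name for the same inode

a, b = os.stat(orig), os.stat(link)
print((a.st_ino, a.st_dev) == (b.st_ino, b.st_dev))  # True -> deleting b.txt frees nothing
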
6 changes: 3 additions & 3 deletions spacesavers2_e2e
@@ -12,9 +12,9 @@ ARGPARSE_DESCRIPTION="End-to-end run of spacesavers2"
source ${SCRIPT_DIR}/resources/argparse.bash || exit 1
argparse "$@" <<EOF || exit 1
parser.add_argument('-f','--folder',required=True, help='Folder to run spacesavers_catalog on.')
parser.add_argument('-p','--threads',required=False, help='number of threads to use', default=4)
parser.add_argument('-d','--maxdepth',required=False, help='maxdepth for mimeo', default=4)
parser.add_argument('-l','--limit',required=False, help='limit for running spacesavers_grubbers', default=5)
parser.add_argument('-p','--threads',required=False, help='number of threads to use for catalog (default 4)', default=4)
parser.add_argument('-d','--maxdepth',required=False, help='maxdepth for mimeo (default 4)', default=4)
parser.add_argument('-l','--limit',required=False, help='limit for running spacesavers_grubbers (default 5)', default=5)
parser.add_argument('-q','--quota',required=False, help='total size of the volume (default = 200 for /data/CCBR)', default=200)
parser.add_argument('-o','--outfolder',required=True, help='Folder where all spacesavers_e2e output files will be saved')
EOF
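The change above simply spells each default out in its help string so that "--help" shows it. A stand-alone sketch of the same pattern in plain Python argparse (the script itself routes these lines through the argparse.bash wrapper):

import argparse

parser = argparse.ArgumentParser(description="End-to-end run of spacesavers2")
parser.add_argument('-p', '--threads', required=False, default=4,
                    help='number of threads to use for catalog (default 4)')
parser.add_argument('-d', '--maxdepth', required=False, default=4,
                    help='maxdepth for mimeo (default 4)')
parser.print_help()  # both defaults now appear in the help text
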
6 changes: 4 additions & 2 deletions spacesavers2_grubbers
@@ -80,12 +80,12 @@ def main():
properly_set = dfu.set(l)
if not properly_set: # could not read line properly or there are no duplicates
continue
if dfu.ndup == 0: continue # in case mimeo was run without -z
if dfu.ndup_uid_inode == 0: continue # in case mimeo was run without -z
dups.append(dfu)

dups.sort() # look at __lt__ ... it's sorting from highest to lowest totalsize
saved = 0
top_limit = args.limit * 1024 * 1024 * 1024 # 5 GiB
top_limit = args.limit * 1024 * 1024 * 1024 # 5 GiB default
if args.outfile:
of = open(args.outfile, "w")
else:
@@ -95,6 +95,8 @@
if fgitem.totalsize <= top_limit:
break
saved += fgitem.totalsize
outstr = str(fgitem)
if outstr == "": continue
of.write("%s\n"%(fgitem))

if args.outfile:
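Two details in this hunk are worth spelling out: args.limit is converted from GiB to bytes, and because dups.sort() orders items largest-first, the loop can stop at the first item at or below the cutoff. A self-contained sketch with made-up sizes:

from collections import namedtuple

FG = namedtuple("FG", "totalsize")
# hypothetical totalsize values in bytes, already sorted largest first
items = [FG(8 << 30), FG(6 << 30), FG(2 << 30)]

limit_gib = 5  # stands in for args.limit
top_limit = limit_gib * 1024 * 1024 * 1024  # GiB -> bytes; 5 GiB = 5368709120

saved = 0
for item in items:
    if item.totalsize <= top_limit:
        break  # sorted order: every later item is also below the cutoff
    saved += item.totalsize
print(saved)  # 15032385536, i.e. the 8 GiB and 6 GiB items
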
9 changes: 5 additions & 4 deletions spacesavers2_mimeo
@@ -56,7 +56,8 @@ def process_hh(
if foldest.uid == uid or 0 == uid : user_owns_original = True
uid_file_index = list(filter(lambda x:x!=oldest_index,uid_file_index)) # remove oldest if present in list
inodes_already_summerized = [foldest.inode]
if hashhash[h].ndup_files > 0: # we have duplicates
# if hashhash[h].ndup_files > 0: # we have duplicates
if len(uid_file_index) > 0: # uid has copies
for i in uid_file_index:
f = hashhash[h].flist[i]
fpath = f.apath
@@ -281,6 +282,7 @@ def main():
)
blamematrix = dict()
all_blamematrix_paths = set()
# users=[0] # debug only
for uid in users:
blamematrix[uid] = dict()
print_with_timestamp(
@@ -363,9 +365,8 @@ def main():
perfolder_summaries[p].update_scores(quota)
user_summary.write(f"{perfolder_summaries[p]}\n")
for p in perfolder_summaries:
dummy = FileDetails()
dummy.initialize(p)
if dummy.get_depth() == mindepth + 1:
p_depth = len(list(p.parents))
if p_depth == mindepth:
all_blamematrix_paths.add(p)
blamematrix[uid][p] = sum(perfolder_summaries[p].dup_Bytes)

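The rewritten depth test uses pathlib directly: the length of a path's parents sequence is its depth, which avoids building a throwaway FileDetails object just to call get_depth(). A quick illustration with a hypothetical path:

from pathlib import Path

p = Path("/data/CCBR/projects")
print(len(list(p.parents)))  # 3 -> parents are /data/CCBR, /data, and /
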
27 changes: 15 additions & 12 deletions src/FileDetails.py
@@ -28,18 +28,21 @@ def convert_time_to_age(t):

def get_type(p):
x = "u" # unknown
if not p.exists():
x = "a" # absent
return x
if p.is_symlink():
x = "l" # link or symlink
return x
if p.is_dir():
x = "d" # directory
return x
if p.is_file():
x = "f" # file
return x
try:
if not p.exists():
x = "a" # absent
return x
if p.is_symlink():
x = "l" # link or symlink
return x
if p.is_dir():
x = "d" # directory
return x
if p.is_file():
x = "f" # file
return x
except: # mainly to catch PermissionError:
sys.stderr.write("spacesavers2:File cannot be read:{}\n".format(p))
return x

class FileDetails:
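The new try/except matters because CPython's Path.is_dir()/is_file() only swallow missing-file errors; a PermissionError from stat() on a path under an unreadable directory propagates and would otherwise kill the walk. A minimal sketch of the same guard, narrowed here to OSError (of which PermissionError is a subclass); classify is a hypothetical stand-in for get_type:

import sys
from pathlib import Path

def classify(p: Path) -> str:
    try:
        if not p.exists():
            return "a"  # absent
        if p.is_symlink():
            return "l"  # link or symlink
        if p.is_dir():
            return "d"  # directory
        if p.is_file():
            return "f"  # file
    except OSError as e:  # mainly PermissionError
        sys.stderr.write("File cannot be read: {}: {}\n".format(p, e))
    return "u"  # unknown

print(classify(Path("/")))  # 'd'
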
2 changes: 1 addition & 1 deletion src/VERSION
@@ -1 +1 @@
0.11.0
0.11.1
74 changes: 47 additions & 27 deletions src/dfUnit.py
@@ -1,13 +1,6 @@
import sys
from .utils import *

def get_filename_from_fgzlistitem(string):
string = string.strip().split(";")[:-1]
for i in range(11):
dummy = string.pop(-1)
filename = ";".join(string)
return filename


class dfUnit:
def __init__(self,hash):
@@ -70,9 +63,9 @@ def compute(self,hashhashsplits):
if fd.size == size:
hashhashsplits[newhash].add_fd(fd)
else: # there is only 1 size ... no splits required
self.ndup = len(self.inode_list) - 1 #ndup is zero if same len(size_set)==1 and len(inode_list)==1
self.ndup = len(self.inode_list) - 1 # ndup is zero if same len(size_set)==1 and len(inode_list)==1
self.ndup_inode = len(set(self.inode_list)) - 1
self.ndup_files = len(self.inode_list) - 1
self.ndup_files = len(self.inode_list) - 1 # some duplicate files may be hard links
self.fsize = self.flist[0].calculated_size
return split_required

@@ -92,52 +85,79 @@ def __str__(self):
return "{0} : {1} {2} {3}".format(self.hash, self.ndup_inode, self.fsize,"##".join(map(lambda x:str(x),self.flist)))

def str_with_name(self,uid2uname, gid2gname,findex):
return "{0} : {1} {2} {3}".format(self.hash, self.ndup_inode, self.fsize,"##".join(map(lambda x:x.str_with_name(uid2uname,gid2gname),[self.flist[i] for i in findex])))
original_inode = self.flist[findex[0]].inode # first file is the original
uid_inodes = [self.flist[i].inode for i in findex[1:]] # 2nd file onwards could be duplicates (same content, different inode) or copies (same inode <-> hardlink)
dup_uid_inodes = list(filter(lambda x:x!=original_inode,uid_inodes))
ndup_uid_inodes = len(dup_uid_inodes)
return "{0} : {1} {2} {3} {4}".format(self.hash,
self.ndup_inode, # total number of duplicates (all users)
ndup_uid_inodes, # total number of duplicates (this uid)
self.fsize, # size of each duplicate
"##".join(map(lambda x:x.str_with_name(uid2uname,gid2gname),[self.flist[i] for i in findex])))

def _get_inode(s):
s = s.strip().split(";")
return(s[-8])

def _get_filename_from_fgzlistitem(string):
string = string.strip().split(";")[:-1]
for i in range(11):
dummy = string.pop(-1)
filename = ";".join(string)
return filename

class fgz: # used by grubber
def __init__(self):
self.hash = ""
self.ndup = -1 # number of duplicate files and not duplicate inodes
self.filesize = -1
self.totalsize = -1
self.fds = [] # list of duplicate files
self.of = "" # original file
self.ndup_inode = -1 # number of duplicate inodes (all users)
self.ndup_uid_inode = -1 # number of duplicate inodes (this uid)
self.filesize = -1 # size of each duplicate or copy
self.totalsize = -1 # total size of duplicates for this uid
self.fds = [] # list of duplicate files for this uid
self.of = "" # original file aka non-duplicate file

def __lt__(self,other):
return self.totalsize > other.totalsize

def __str__(self):
dup_fds = []
inodes_seen = dict()
inodes_seen[_get_inode(self.of)] = 1
for f in self.fds:
f_inode = _get_inode(f)
if f_inode in inodes_seen: continue
inodes_seen[f_inode] = 1
dup_fds.append(f)
outstring=[]
outstring.append(str(self.hash))
outstring.append(str(self.ndup))
outstring.append(str(self.ndup_uid_inode))
outstring.append(str(self.totalsize))
outstring.append(str(self.filesize))
outstring.append(get_filename_from_fgzlistitem(self.of))
outstring.append("##".join(map(lambda x:get_filename_from_fgzlistitem(x),self.fds)))
outstring.append(_get_filename_from_fgzlistitem(self.of))
outstring.append("##".join(map(lambda x:_get_filename_from_fgzlistitem(x),dup_fds)))
return "\t".join(outstring)
# return "{0} {1} {2} {3} {4}".format(self.hash,self.ndup,get_human_readable_size(self.totalsize), get_human_readable_size(self.filesize), ";".join(map(lambda x:get_filename_from_fgzlistitem(x),self.fds)))
# return "{0} {1} {2} {3} {4}".format(self.hash,self.ndup,self.totalsize, self.filesize, ";".join(map(lambda x:get_filename_from_fgzlistitem(x),self.fds)))
# return "{0} {1} {2} {3} {4}".format(self.hash,self.ndup,get_human_readable_size(self.totalsize), get_human_readable_size(self.filesize), ";".join(map(lambda x:_get_filename_from_fgzlistitem(x),self.fds)))
# return "{0} {1} {2} {3} {4}".format(self.hash,self.ndup,self.totalsize, self.filesize, ";".join(map(lambda x:_get_filename_from_fgzlistitem(x),self.fds)))

# 09f9599cff76f6c82a96b042d67f81ff#09f9599cff76f6c82a96b042d67f81ff : 158 1348 "/data/CCBR/projects/ccbr583/Pipeliner/.git/hooks/pre-push.sample";1348;41;8081532070425347857;1;1552;35069;57786;jailwalapa;CCBR;##"/data/CCBR/projects/ccbr785/FREEC/.git/hooks/pre-push.sample";1348;41;11610558684702129747;1;1629;35069;57786;jailwalapa;CCBR;##"/data/CCBR/projects/ccbr785/citup/pypeliner/.git/hooks/pre-push.sample";1348;41;9306919632329364056;1;1624;35069;57786;jailwalapa;CCBR;##"/data/CCBR/projects/ccbr785/titan_workflow/.git/hooks/pre-push.sample";1348;41;7658100918611057517;1;1628;35069;57786;jailwalapa;CCBR;##"/data/CCBR/rawdata/ccbr1016/batch1/fastq/scratch/example/Pipeliner/.git/hooks/pre-push.sample";1348;41;328973360624494807;1;1253;35069;57786;jailwalapa;CCBR;##"/data/CCBR/rawdata/ccbr1040/Seq2n3n4n5_GEXnHTO/Pipeliner/.git/hooks/pre-push.sample";1348;41;16190385205193530167;1;1093;35069;57786;jailwalapa;CCBR;##"/data/CCBR/rawdata/ccbr1044/Pipeliner/.git/hooks/pre-push.sample";1348;41;10429578581567757002;1;1110;35069;57786;jailwalapa;CCBR;
def set(self,inputline):
original_line = inputline
try:
inputline = inputline.strip().split(" ")
if len(inputline) < 5:
raise Exception("Less than 5 items in mimeo.files.gz line.")
if len(inputline) < 6:
raise Exception("Less than 6 items in mimeo.files.gz line.")
self.hash = inputline.pop(0)
dummy = inputline.pop(0) # the colon
total_ndup = int(inputline.pop(0))
if total_ndup == 0: # maybe mimeo was run to output all files .. not just dups .. aka without the -z option
self.ndup_inode = int(inputline.pop(0))
self.ndup_uid_inode = int(inputline.pop(0))
if self.ndup_uid_inode == 0: # maybe mimeo was run to output all files .. not just dups .. aka without the -z option
return False
self.filesize = int(inputline.pop(0))
full_fds = " ".join(inputline) # because file names can contain spaces
fds = full_fds.split("##")
self.ndup = total_ndup # this user's number of duplicate files
self.of = fds.pop(0) # one file is the original
self.fds = fds # others are duplicates
self.totalsize = total_ndup * self.filesize
self.fds = fds # other duplicates or copies
self.totalsize = self.ndup_uid_inode * self.filesize # total size of all duplicates for this uid
return True
except:
sys.stderr.write("spacesavers2:{0}:files.gz Do not understand line:{1} with {2} elements.\n".format(self.__class__.__name__,original_line,len(inputline)))
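For reference, each record in mimeo.files.gz (see the sample line embedded above) is a hash, the counts, the per-copy size, and then "##"-separated file entries whose fields are semicolon-delimited, with the inode sitting eight fields from the end, which is what _get_inode() pulls out. A small sketch of the inode-based de-duplication that __str__() now performs, using two hypothetical entries that hard-link the same inode:

def get_inode(entry):
    # semicolon-delimited entry; the inode is the 8th field from the end
    return entry.strip().split(";")[-8]

entries = [
    '"/data/x/a.txt";1348;41;111;1;1552;35069;57786;user;CCBR;',
    '"/data/x/b.txt";1348;41;111;1;1552;35069;57786;user;CCBR;',  # same inode 111
]
seen, dup_fds = set(), []
for e in entries:
    ino = get_inode(e)
    if ino in seen:
        continue  # hard link to an inode already counted
    seen.add(ino)
    dup_fds.append(e)
print(len(dup_fds))  # 1 -> the hard-linked copy is not reported again
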
