-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathid_pairs.py
executable file
·26 lines (19 loc) · 921 Bytes
/
id_pairs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
#!/usr/bin/env python
"""Write every unordered pair of distinct ids found in a CSV file to HDF5.

Usage: id_pairs.py -o OUTPUT.h5 [--field FIELD] INPUT.csv

The input CSV must contain an ``id`` column and the column named by
``--field``.  The output file holds a single dataset, ``id_pairs``, which is
a 1-D compound array with the byte-string fields ``id1`` and ``id2``.

NOTE(review): the ``--field`` help text talks about counting common entries,
but this script only enumerates the id pairs; the counting itself is
presumably done by a later step -- confirm against the rest of the pipeline.
"""
import argparse
import itertools as it


def id_pairs(table, field):
    """Return all unordered pairs of distinct ids in *table*.

    Parameters:
        table: pandas DataFrame holding an ``id`` column and a *field* column.
        field: name of the column selected from every ``id`` group.  Its
            values are not inspected here; selecting it only ensures that
            the column exists.

    Returns:
        A 1-D numpy structured array with fields ``id1`` and ``id2`` (both
        ``S24``), one row per 2-combination of the group keys, in the key
        order produced by ``groupby('id')`` (sorted by default).  The array
        is empty when there are fewer than two distinct ids.

    Raises:
        KeyError: if ``id`` or *field* is not a column of *table*.
    """
    # Imported lazily so that loading this module stays cheap.
    import numpy as np

    # Keep only the group keys: handing the (key, Series) tuples straight to
    # combinations() would hold every group's data in memory for no benefit.
    ids = [key for key, _ in table.groupby('id')[field]]
    pairs = list(it.combinations(ids, 2))
    # 'S24' stores each id as at most 24 bytes; longer ids are cut short by
    # numpy on assignment.
    return np.array(pairs, dtype=[('id1', 'S24'), ('id2', 'S24')])


def main(argv=None):
    """Parse the command line, read the CSV and write the pair dataset.

    Parameters:
        argv: argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    # The text is the program description, so it is passed by keyword; the
    # first positional parameter of ArgumentParser is `prog`.
    psr = argparse.ArgumentParser(description="generate id pairs")
    # The output path goes straight to h5py.File, so it is mandatory; failing
    # here is cheaper than failing after all pairs have been built.
    psr.add_argument("-o", dest='opt', required=True, help="output")
    psr.add_argument('ipt', help="input")
    psr.add_argument('--field', default='org', help="the field to count common entries in")
    args = psr.parse_args(argv)

    # Heavy third-party imports stay after argument parsing so that --help
    # and usage errors return without loading them.
    import h5py
    import pandas as pd

    au = pd.read_csv(args.ipt)
    x = id_pairs(au, args.field)
    with h5py.File(args.opt, 'w') as opt:
        opt.create_dataset('id_pairs', data=x, compression="gzip", shuffle=True)


if __name__ == "__main__":
    main()