296 lines
9.8 KiB
Python
296 lines
9.8 KiB
Python
from __future__ import absolute_import
|
|
from __future__ import print_function
|
|
from __future__ import division
|
|
|
|
import argparse
|
|
import re
|
|
from bson.json_util import loads
|
|
|
|
# Venue keywords used to recognise the conferences of interest.
#
# Layout contract (relied upon by be_canonical): the list is a flat sequence
# of (abbreviation, full name) pairs, so every even index holds an
# abbreviation and the odd index right after it holds the spelled-out name.
# A missing comma silently merges two adjacent strings and shifts that
# parity, so keep one pair per entry group and keep the trailing commas.
#
# Order matters too: be_canonical takes the FIRST keyword that occurs as a
# substring of the venue name, hence 'PAKDD'/'PKDD' precede 'KDD' and 'WSDM'
# precedes 'SDM'.
# NOTE(review): 'International Conference on Data Mining' (ICDM) is a
# substring of 'SIAM International Conference on Data Mining' (SDM), so a
# venue that spells out the SDM name is attributed to ICDM by first-match
# lookups -- confirm whether that is intended.
KEYWORDS = [
    'AAAI', 'Association for the Advancement of Artificial Intelligence',
    'CIKM', 'Conference on Information and Knowledge Management',
    'CVPR', 'Conference on Computer Vision and Pattern Recognition',
    'ECIR', 'European Conference on Information Retrieval',
    'ECML', 'European Conference on Machine Learning',
    'EDBT', 'International Conference on Extending Database Technology',
    'ICDE', 'International Conference on Data Engineering',
    'ICDM', 'International Conference on Data Mining',
    'ICML', 'International Conference on Machine Learning',
    'IJCAI', 'International Joint Conference on Artificial Intelligence',
    'PAKDD',
    'Pacific-Asia Conference on Knowledge Discovery and Data Mining',
    'PKDD', 'Principles and Practice of Knowledge Discovery in Databases',
    'KDD', 'Knowledge Discovery and Data Mining',
    'PODS', 'Principles of Database Systems',
    'SIGIR', 'Special Interest Group on Information Retrieval',
    'SIGMOD', 'Special Interest Group on Management of Data',
    'VLDB', 'Very Large Data Bases',
    'WWW', 'World Wide Web Conference',
    'WSDM', 'Web Search and Data Mining',
    'SDM', 'SIAM International Conference on Data Mining',
]
|
|
# Lookup table from a conference abbreviation (the upper-cased abbreviations
# of KEYWORDS) to the organization string that be_canonical writes next to it.
CONF2ORG = {
    'AAAI': 'AAAI',
    'CIKM': 'ACM',
    'CVPR': 'IEEE',
    'ECIR': 'Springer',
    'ECML': 'Springer',
    'EDBT': 'Springer',
    'ICDE': 'IEEE',
    'ICDM': 'IEEE',
    'ICML': 'PMLR',
    'IJCAI': 'the IJCAI, Inc.',
    'KDD': 'ACM',
    'PAKDD': 'Springer',
    'PKDD': 'Springer',
    'PODS': 'ACM',
    'SDM': 'SIAM',
    'SIGIR': 'ACM',
    'SIGMOD': 'ACM',
    'VLDB': 'VLDB',
    'WWW': 'ACM',
    'WSDM': 'ACM',
}
|
|
|
|
# Field-of-study names used as class labels.  be_fourclass_data compares them
# case-insensitively and emits a label's position in this list as the class
# id, so the order is significant.
LABELS = [
    'Database',
    'Data mining',
    'Artificial intelligence',
    'Information retrieval',
]
|
|
|
|
# Command-line interface.  The arguments are parsed at import time and the
# resulting namespace is read as the module-level global ``args`` by the
# processing functions and by main().
parser = argparse.ArgumentParser(
    description='Pre-processing steps for a paper dump; each step reads '
    '--input_path and steps 0-2 write --output_path.')
parser.add_argument(
    '--choice',
    type=int,
    default=-1,
    help='step to run: 0 = extract_considered, 1 = be_canonical, '
    '2 = be_fourclass_data, 3 = stats; any other value does nothing')
parser.add_argument(
    '--input_path',
    type=str,
    default='',
    help='path of the file read by the selected step')
parser.add_argument(
    '--output_path',
    type=str,
    default='',
    help='path of the file written by the selected step '
    '(not used by step 3)')
args = parser.parse_args()
|
|
|
|
|
|
def _one_line(text):
    """Drop newlines and turn tabs into spaces so *text* fits one TSV cell."""
    return text.replace('\n', '').replace('\t', ' ')


def extract_considered(input_path=None, output_path=None):
    """Keep only the papers whose venue matches one of KEYWORDS.

    The input is read line by line.  Its first line is always skipped; after
    that, lines are buffered from a line starting with ``{`` up to the line
    starting with ``}`` that closes it, and the buffered text is parsed as
    one JSON document with ``bson.json_util.loads`` (``NumberInt(x)``
    wrappers are rewritten to ``{"$numberInt": "x"}`` first, and a trailing
    comma is dropped).

    A record is written when it has ``_id``, a non-empty ``venue.raw``, a
    non-empty ``fos``, a ``references`` key and a non-empty ``title``, and
    its lower-cased raw venue name contains one of the lower-cased keywords.
    Each output line holds five tab-separated columns: id, raw venue, title,
    comma-joined fos and comma-joined references.

    Blank lines and bare ``[`` / ``]`` lines between records are ignored.
    Previously such a line raised inside the parsing code and the blanket
    handler below ended the extraction at that point.

    Args:
        input_path: file to read; defaults to ``args.input_path``.
        output_path: file to write; defaults to ``args.output_path``.

    Any exception raised while reading or parsing is printed and ends the
    extraction early (best effort, as before); the output file is closed in
    every case.
    """
    if input_path is None:
        input_path = args.input_path
    if output_path is None:
        output_path = args.output_path

    # KEYWORDS are joined unescaped, exactly as before; none of them is
    # expected to contain regex metacharacters.
    pat = re.compile(r'|'.join(val.lower() for val in KEYWORDS))
    number_int = re.compile(r'NumberInt\s*\(\s*(\S+)\s*\)')

    ent = 0  # brace depth, counted only on lines that START with { or }
    cnt = 0  # records parsed so far
    rsvd = 0  # records written so far
    with open(output_path, 'w') as ops:
        try:
            with open(input_path, 'r') as ips:
                ele_contents = []
                next(ips, None)  # the first line is never part of a record
                for line in ips:
                    text = line.strip()
                    if ent == 0 and text in ('', '[', ']'):
                        # Nothing to parse between two records.
                        continue

                    if line[0] == '{':
                        ent += 1
                    elif line[0] == '}':
                        ent -= 1
                    ele_contents.append(text)
                    if ent != 0:
                        continue

                    # A complete record has been buffered: parse it.
                    json_text = number_int.sub(r'{"$numberInt": "\1"}',
                                               ''.join(ele_contents))
                    ele_contents = []
                    if json_text.endswith(','):
                        json_text = json_text[:-1]
                    ele = loads(json_text)

                    usable = ('_id' in ele and 'venue' in ele
                              and 'raw' in ele['venue']
                              and ele['venue']['raw'] and 'fos' in ele
                              and ele['fos'] and 'references' in ele
                              and 'title' in ele and ele['title'])
                    if usable and pat.search(ele['venue']['raw'].lower()):
                        ops.write("{}\t{}\t{}\t{}\t{}\n".format(
                            ele['_id'], _one_line(ele['venue']['raw']),
                            _one_line(ele['title']),
                            _one_line(','.join(ele['fos'])),
                            ','.join(ele['references'])))
                        rsvd += 1

                    cnt += 1
                    if cnt % 100000 == 0:
                        print(rsvd, cnt, "======>")
        except Exception as ex:
            # Deliberately best effort: report the problem and keep whatever
            # has been written so far.
            print(ex)
|
|
|
|
|
|
"""
|
|
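Counts in the format printed by be_canonical() (papers per conference, then
papers per organization), presumably from an earlier run:

{'ICDM': 4589, 'KDD': 5476, 'IJCAI': 7586, 'VLDB': 5314, 'PAKDD': 2242,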
|
|
'ECIR': 1482, 'ICML': 8322, 'CIKM': 5931, 'WWW': 5553, 'CVPR': 13355,
|
|
'EDBT': 1636, 'AAAI': 9695, 'ECML': 2216, 'SIGMOD': 4206, 'ICDE': 4330,
|
|
'PODS': 1670, 'SDM': 1624, 'SIGIR': 4619, 'WSDM': 746, 'PKDD': 547}
|
|
======================
|
|
{'IEEE': 22274, 'ACM': 28201, 'the IJCAI, Inc.': 7586, 'VLDB': 5314,
|
|
'Springer': 8123, 'PMLR': 8322, 'AAAI': 9695, 'SIAM': 1624}
|
|
"""
|
|
|
|
|
|
def be_canonical(input_path=None, output_path=None):
    """Replace raw venue names by a canonical conference and organization.

    Every input line must contain exactly four tabs (five columns, the
    layout written by extract_considered: id, raw venue, title, fos,
    references); other lines are reported on stdout and skipped.  The first
    entry of KEYWORDS that occurs in the lower-cased raw venue decides the
    conference: the abbreviation of the matching pair, upper-cased.  Lines
    whose venue matches no keyword are printed and skipped.

    Each output line holds six tab-separated columns: id, conference,
    organization (from CONF2ORG) and the remaining three input columns.
    The per-conference and per-organization counts are printed at the end.

    Only the line terminator is removed before splitting.  Stripping all
    surrounding whitespace, as was done before, also removed the trailing
    tab of a line with an empty references column and made the access to
    the fifth column fail.

    Args:
        input_path: file to read; defaults to ``args.input_path``.
        output_path: file to write; defaults to ``args.output_path``.

    Raises:
        KeyError: if the matched abbreviation is missing from CONF2ORG.
    """
    if input_path is None:
        input_path = args.input_path
    if output_path is None:
        output_path = args.output_path

    keywords = [val.lower() for val in KEYWORDS]
    conf_cnts = dict()
    org_cnts = dict()
    with open(output_path, 'w') as ops, open(input_path, 'r') as ips:
        for line in ips:
            num_of_tab = line.count('\t')
            if num_of_tab != 4:
                print(num_of_tab)
                print(line.replace('\t', 'TAB'))
                continue

            cols = line.rstrip('\r\n').split('\t')
            conf_raw_name = cols[1].lower()
            org, conf_name = '', ''
            for i, kw in enumerate(keywords):
                if kw in conf_raw_name:
                    # Even index = abbreviation, odd index = its full name,
                    # so step back by one when the full name matched.
                    conf_name = keywords[i - (i % 2)].upper()
                    org = CONF2ORG[conf_name]
                    break
            if conf_name == '':
                print(cols[1])
                continue

            conf_cnts[conf_name] = conf_cnts.get(conf_name, 0) + 1
            org_cnts[org] = org_cnts.get(org, 0) + 1
            ops.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(
                cols[0], conf_name, org, cols[2], cols[3], cols[4]))

    print(conf_cnts)
    print("======================")
    print(org_cnts)
|
|
|
|
|
|
def _first_label(fos_field, labels):
    """Return the index in *labels* of the first field of study that is a
    label (compared lower-cased), or None when there is none."""
    for val in fos_field.split(','):
        can_val = val.lower()
        if can_val in labels:
            return labels.index(can_val)
    return None


def _canonical_rows(path):
    """Yield ``(columns, reference ids)`` for every line of *path*.

    Only the line terminator is removed before splitting on tabs, so an
    empty trailing references column stays an (empty) column of its own.
    """
    with open(path, 'r') as ips:
        for line in ips:
            cols = line.rstrip('\r\n').split('\t')
            refs = cols[5].split(',') if len(cols) > 5 else []
            yield cols, refs


def be_fourclass_data(input_path=None, output_path=None):
    """Build the labelled citation graph restricted to the LABELS classes.

    The input is read three times and is expected to have the six
    tab-separated columns written by be_canonical (id, conference,
    organization, title, fos, references):

    1. collect the papers whose fos column contains one of LABELS; the first
       matching entry is the paper's label (classes are treated as
       exclusive);
    2. count, per collected paper, its references to and from other
       collected papers;
    3. renumber the papers that take part in at least one such edge with
       consecutive integers and write one line per paper: new id,
       conference, organization, title, label index and the comma-joined
       new ids of its references that are kept.

    The number of labelled papers, the number of edges and the number of
    connected papers are printed along the way.

    References are taken from the sixth column only.  Previously the whole
    line was stripped, so for a paper without references the last column
    that was read was the fos column.

    Args:
        input_path: file to read; defaults to ``args.input_path``.
        output_path: file to write; defaults to ``args.output_path``.
    """
    if input_path is None:
        input_path = args.input_path
    if output_path is None:
        output_path = args.output_path

    labels = [val.lower() for val in LABELS]

    # Pass 1: papers carrying one of the labels -> [out-edges, in-edges].
    cnt = 0
    vset = dict()
    for cols, _ in _canonical_rows(input_path):
        if _first_label(cols[4], labels) is not None:
            cnt += 1
            vset[cols[0]] = [0, 0]
    print(cnt)

    # Pass 2: citation edges with both end points among the labelled papers.
    e_cnt = 0
    for cols, refs in _canonical_rows(input_path):
        if cols[0] not in vset:
            continue
        for val in refs:
            if val in vset:
                e_cnt += 1
                vset[cols[0]][0] += 1
                vset[val][1] += 1
    print(e_cnt)

    # Papers with at least one edge get new consecutive ids.
    connected = dict(
        (key, idx) for idx, key in enumerate(
            k for k, v in vset.items() if v[0] > 0 or v[1] > 0))
    print(len(connected))

    # Pass 3: write the renumbered, labelled graph.
    with open(output_path, 'w') as ops:
        for cols, refs in _canonical_rows(input_path):
            nid = cols[0]
            if nid not in connected:
                continue
            lb = _first_label(cols[4], labels)
            if lb is None:
                # Can only happen when an id occurs on several lines and
                # this one has no label; never reuse a stale label.
                continue
            adjs = ','.join(
                str(connected[val]) for val in refs if val in connected)
            ops.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(
                connected[nid], cols[1], cols[2], cols[3], lb, adjs))
|
|
|
|
|
|
def _read_rows(path):
    """Yield the tab-separated columns of every line of *path*.

    Only the line terminator is removed, so an empty trailing column (a node
    without neighbours) is kept as an empty string.
    """
    with open(path, 'r') as ips:
        for line in ips:
            yield line.rstrip('\r\n').split('\t')


def _edge_stats_by_group(path, group_col):
    """Aggregate node, edge and label counts per value of column *group_col*.

    Returns a dict mapping each group to
    ``[nodes, edges to the same group, edges to another group,
    [nodes with label 0, label 1, label 2, label 3]]``.
    """
    node2group = dict((cols[0], cols[group_col]) for cols in _read_rows(path))

    result = dict()
    for cols in _read_rows(path):
        group = cols[group_col]
        if group not in result:
            result[group] = [0, 0, 0, [0, 0, 0, 0]]
        entry = result[group]
        entry[0] += 1
        neighbours = cols[5].split(',') if len(cols) > 5 else []
        for nbr in neighbours:
            if not nbr:
                # Empty adjacency list: there is no edge to count.
                continue
            if node2group[nbr] == group:
                entry[1] += 1
            else:
                entry[2] += 1
        entry[3][int(cols[4])] += 1
    return result


def stats(input_path=None):
    """Print per-conference and then per-organization graph statistics.

    The input is expected to have the six tab-separated columns written by
    be_fourclass_data: node id, conference, organization, title, label index
    and comma-separated neighbour ids.  For every conference (column 1) and
    then for every organization (column 2) one line is printed: the group
    name followed by ``[nodes, intra-group edges, inter-group edges,
    [node count per label]]``.

    A node without neighbours contributes no edge.  Previously the whole
    line was stripped, the empty last column disappeared and the node's
    label index was read as a neighbour id.

    Args:
        input_path: file to read; defaults to ``args.input_path``.

    Raises:
        KeyError: if a neighbour id does not occur as a node id in the file.
    """
    if input_path is None:
        input_path = args.input_path

    for group_col in (1, 2):
        for k, v in _edge_stats_by_group(input_path, group_col).items():
            print(k, v)
|
|
|
|
|
|
def main():
    """Run the processing step selected by ``--choice``.

    0 runs extract_considered, 1 be_canonical, 2 be_fourclass_data and
    3 stats; for any other value nothing is done.
    """
    steps = {
        0: extract_considered,
        1: be_canonical,
        2: be_fourclass_data,
        3: stats,
    }
    step = steps.get(args.choice)
    if step is not None:
        step()
|
|
|
|
|
|
# Run the step selected on the command line only when this file is executed
# as a script (the arguments themselves are parsed at import time, above).
if __name__ == "__main__":
    main()
|