# Source code for gismo.datasets.acm

import pkgutil
import gzip
import json


def get_acm_from_package():
    """
    Read the ``acm.json.gz`` resource of the current package and decode it.

    Returns
    -------
    object
        The Python object obtained by gunzipping the resource and parsing
        the result as JSON.
    """
    compressed = pkgutil.get_data(__package__, "acm.json.gz")
    raw_json = gzip.decompress(compressed)
    return json.loads(raw_json)


def get_acm():
    """
    Return the ACM tree loaded by :func:`get_acm_from_package`.

    Returns
    -------
    acm: list of dicts
        Each dict is an ACM domain. It contains category name, query
        (concatenation of names from domain and subdomains), size (number of
        subdomains including itself), and children (list of domain dicts).

    Examples
    --------
    >>> acm = get_acm()
    >>> subdomain = acm[4]['children'][2]['children'][1]
    >>> subdomain['name']
    'Software development process management'
    >>> subdomain['size']
    10
    >>> subdomain['query']
    'Software development process management, Software development methods, Rapid application development, Agile software development, Capability Maturity Model, Waterfall model, Spiral model, V-model, Design patterns, Risk management'
    >>> len(acm)
    13
    """
    acm = get_acm_from_package()
    return acm
def flatten_acm(acm, min_size=5, max_depth=100, exclude=None, depth=0):
    """
    Select subdomains of an acm tree and return them as a list.

    Parameters
    ----------
    acm: list of dicts
        acm tree from get_acm.
    min_size: int
        size threshold to select a domain (avoids small domains). A domain is
        kept only if its size is strictly greater than this value.
    max_depth: int
        depth threshold to select a domain (avoids deep domains). Top-level
        domains have depth 0; children are explored only while the current
        depth is strictly below this value.
    exclude: list
        list of domain names to exclude from the results.
    depth: int
        depth of `acm` inside the whole tree. Used by the recursion; callers
        should normally leave the default.

    Returns
    -------
    list
        A flat list of domains described by name and query. Domains sharing
        the same name are merged into a single entry (placed where the name
        first occurs, carrying the query of its last occurrence).

    Examples
    --------
    >>> acm = flatten_acm(get_acm())
    >>> acm[111]['name']
    'Graph theory'
    """
    if exclude is None:
        exclude = set()

    # Domains of the current level that are large enough come first.
    result = [
        {"name": t["name"], "query": t["query"]} for t in acm if t["size"] > min_size
    ]

    # Then the selected descendants of each domain, in tree order.
    if depth < max_depth:
        for t in acm:
            # A missing, None or empty "children" entry denotes a leaf.
            children = t.get("children")
            if children:
                result += flatten_acm(
                    children, min_size=min_size, max_depth=max_depth, depth=depth + 1
                )

    # Deduplication and exclusion are applied once, on the complete list;
    # this is why `exclude` does not need to be forwarded to recursive calls.
    if depth == 0:
        unique = {t["name"]: t["query"] for t in result}
        result = [
            {"name": name, "query": query}
            for name, query in unique.items()
            if name not in exclude
        ]
    return result