Source code for gismo.datasets.acm
import pkgutil
import gzip
import json
def get_acm_from_package():
    """
    Load the ACM data bundled with this package.

    Returns
    -------
    object
        The decoded content of the gzip-compressed JSON resource
        ``acm.json.gz`` located in the current package.
    """
    # The resource is looked up relative to the package of this module.
    compressed = pkgutil.get_data(__package__, "acm.json.gz")
    # Raw bytes -> decompressed JSON document -> Python objects.
    document = gzip.decompress(compressed)
    return json.loads(document)
[docs]
def get_acm():
    """
    Return the ACM domain tree loaded from the package data.

    Returns
    -------
    acm: list of dicts
        Each dict is an ACM domain. It contains category name,
        query (concatenation of names from domain and subdomains),
        size (number of subdomains including itself), and children (list of domain dicts).

    Examples
    --------
    >>> acm = get_acm()
    >>> subdomain = acm[4]['children'][2]['children'][1]
    >>> subdomain['name']
    'Software development process management'
    >>> subdomain['size']
    10
    >>> subdomain['query']
    'Software development process management, Software development methods, Rapid application development, Agile software development, Capability Maturity Model, Waterfall model, Spiral model, V-model, Design patterns, Risk management'
    >>> len(acm)
    13
    """
    # Public entry point: the actual loading is delegated to the package reader.
    acm = get_acm_from_package()
    return acm
[docs]
def flatten_acm(acm, min_size=5, max_depth=100, exclude=None, depth=0):
    """
    Select subdomains of an acm tree and return them as a flat list.

    Parameters
    ----------
    acm: list of dicts
        acm tree from get_acm. Every dict is read through its ``name``,
        ``query``, ``size`` and ``children`` keys.
    min_size: int
        Size threshold: a domain is selected only if its size is strictly
        greater than this value (avoids small domains).
    max_depth: int
        Depth threshold: children are visited only while the current depth
        is below this value (avoids deep domains).
    exclude: list
        Names of the domains to exclude from the results.
    depth: int
        Depth of `acm` inside the whole tree. Recursive calls increase it;
        deduplication and exclusion are applied only when it is 0.

    Returns
    -------
    list
        A flat list of domains described by name and query. In the top-level
        result each name appears once: it keeps the position of its first
        occurrence and the query of its last occurrence.

    Example
    -------
    >>> acm = flatten_acm(get_acm())
    >>> acm[111]['name']
    'Graph theory'
    """
    banned = set() if exclude is None else exclude

    # Domains of the current level that are large enough come first ...
    selected = []
    for domain in acm:
        if domain["size"] > min_size:
            selected.append({"name": domain["name"], "query": domain["query"]})

    # ... followed by the selection of each subtree, in domain order.
    for domain in acm:
        children = domain["children"]
        if len(children) == 0 or depth >= max_depth:
            continue
        selected.extend(
            flatten_acm(
                children, min_size=min_size, max_depth=max_depth, depth=depth + 1
            )
        )

    # Intermediate levels hand back their raw selection; only the top-level
    # call merges duplicates and applies the exclusion.
    if depth != 0:
        return selected

    # A dict keeps the first insertion position of a name while a later
    # duplicate overwrites its query.
    query_by_name = {}
    for entry in selected:
        query_by_name[entry["name"]] = entry["query"]

    return [
        {"name": name, "query": query}
        for name, query in query_by_name.items()
        if name not in banned
    ]