# Source code for gismo.datasets.acm

import requests
import urllib
import pkgutil
import gzip
import json

from bs4 import BeautifulSoup as bs


def get_acm_from_package():
    """
    Load the ACM forest stored alongside this package.

    Returns
    -------
    object
        The JSON value decoded from the gzip-compressed ``acm.json.gz``
        package resource.
    """
    archive = pkgutil.get_data(__package__, 'acm.json.gz')
    payload = gzip.decompress(archive)
    return json.loads(payload)


def rec_classification(domain):
    """
    Recursively convert one parsed classification entry into a nested dict.

    Parameters
    ----------
    domain: tag-like object
        Parsed entry. Its first ``a`` element gives the name; its first ``ul``
        element (if any) holds the sub-entries as direct ``li`` children.

    Returns
    -------
    dict
        ``name`` (text of the ``a`` element), ``children`` (list of dicts built
        the same way), ``query`` (name followed by the children queries,
        joined with ``", "``) and ``size`` (1 + sum of the children sizes).
    """
    name = domain.find('a').text
    sublist = domain.find('ul')
    # `find_all` is the supported spelling of the deprecated `findChildren`;
    # recursive=False keeps only direct items so each nesting level is
    # handled by its own recursive call.
    items = sublist.find_all('li', recursive=False) if sublist else []
    children = [rec_classification(item) for item in items]
    nested = ", ".join(child['query'] for child in children)
    # Key order (name, children, query, size) is kept for stable serialisation.
    return {
        'name': name,
        'children': children,
        'query': f"{name}, {nested}" if nested else name,
        'size': 1 + sum(child['size'] for child in children),
    }


def get_acm_from_internet(timeout=30.0):
    """
    Build the ACM forest by scraping https://dl.acm.org/ccs.

    Parameters
    ----------
    timeout: float
        Timeout, in seconds, passed to each HTTP request.

    Returns
    -------
    list of dicts
        One dict per top-level entry, as produced by ``rec_classification``.

    Raises
    ------
    requests.HTTPError
        If either request answers with an error status.
    """
    # Import the submodule explicitly: a bare `import urllib` does not
    # guarantee that `urllib.parse` is loaded.
    from urllib.parse import quote, urlencode

    # The context manager closes the session (and its pooled connections).
    with requests.Session() as session:
        landing = session.get('https://dl.acm.org/ccs', timeout=timeout)
        landing.raise_for_status()
        page = bs(landing.text, features="lxml")
        # Both values are read from the landing page and sent back as query
        # parameters of the second request.
        wid = page.find('input', {'id': 'widgetID'})['value']
        pbc = page.find('meta', {'name': 'pbContext'})['content']
        # quote_via=quote percent-encodes spaces as %20 instead of '+'.
        params = urlencode({'widgetId': wid, 'pbContext': pbc}, quote_via=quote)
        url = f"https://dl.acm.org/pb/widgets/acmCCS/fetchFlatView?{params}"
        listing = session.get(url, timeout=timeout)
        listing.raise_for_status()

    tree = bs(listing.text, features="lxml")
    # Top-level entries are the direct `li` children of the second nested div.
    domains = tree.find('div').find('div').find_all('li', recursive=False)
    return [rec_classification(d) for d in domains]


def get_acm(refresh=False):
    """
    Return the ACM classification as a forest of nested domain dicts.

    Parameters
    ----------
    refresh: bool
        If ``True``, builds a new forest from the Internet, otherwise use a static version.

    Returns
    -------
    acm: list of dicts
        Each dict is an ACM domain. It contains category name, query (concatenation of names from domain and
        subdomains), size (number of subdomains including itself), and children (list of domain dicts).

    Examples
    --------
    >>> acm = get_acm()
    >>> subdomain = acm[4]['children'][2]['children'][1]
    >>> subdomain['name']
    'Software development process management'
    >>> subdomain['size']
    10
    >>> subdomain['query']
    'Software development process management, Software development methods, Rapid application development, Agile software development, Capability Maturity Model, Waterfall model, Spiral model, V-model, Design patterns, Risk management'

    >>> acm = get_acm(refresh=True)
    >>> len(acm)
    13
    """
    if refresh:
        return get_acm_from_internet()
    return get_acm_from_package()
def flatten_acm(acm, min_size=5, max_depth=100, exclude=None, depth=0):
    """
    Select subdomains of an acm tree and return them as a list.

    Parameters
    ----------
    acm: list of dicts
        acm tree from get_acm.
    min_size: int
        size threshold to select a domain (avoids small domains); a domain
        is kept only when its size is strictly greater than this value.
    max_depth: int
        depth threshold to select a domain (avoids deep domains)
    exclude: list
        list of domains to exclude from the results
    depth: int
        depth of ``acm`` inside the whole tree. It is incremented by the
        recursion; de-duplication and ``exclude`` filtering are only applied
        when it is 0.

    Returns
    -------
    list
        A flat list of domains described by name and query.

    Examples
    --------
    >>> acm = flatten_acm(get_acm())
    >>> acm[111]['name']
    'Graph theory'
    """
    if exclude is None:
        exclude = set()
    # Domains of the current level come first, then those of each subtree.
    result = [{'name': t['name'], 'query': t['query']}
              for t in acm if t['size'] > min_size]
    for t in acm:
        if t['children'] and depth < max_depth:
            result += flatten_acm(t['children'], min_size=min_size,
                                  max_depth=max_depth, depth=depth + 1)
    if depth == 0:
        # Collapse duplicate names: the dict keeps the position of the first
        # occurrence and the query of the last one; then drop excluded names.
        unique = {t['name']: t['query'] for t in result}
        result = [{'name': name, 'query': query}
                  for name, query in unique.items() if name not in exclude]
    return result