Source code for gismo.datasets.reuters

import requests
from zipfile import ZipFile
import io

# URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00217/C50.zip"
URL = "https://github.com/balouf/datasets/raw/main/C50.zip"

[docs]def get_reuters_entry(name, z): """ Read the Reuters news referenced by `name` in the zip archive `z` and returns it as a dict. Parameters ---------- name: str Location of the file inside the Reuters archive z: ZipFile Zipfile descriptor of the Reuters archive Returns ------- entry: dict dict with keys `set` (`C50test` or `c50train`), `author`, `id`, and `content` """ with z.open(name) as f: description = name.split("/") return {'set': description[0], 'author': description[1], 'id': description[2][:-4], 'content': f.read().decode()}
[docs]def get_reuters_news(url=URL): """ Returns a list of news from the Reuters C50 news datasets Acknowledgments Dua, D. and Graff, C. (2019). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science. ZhiLiu, e-mail: liuzhi8673 '@' gmail.com, institution: National Engineering Research Center for E-Learning, Hubei Wuhan, China Parameters ---------- url: str Location of the C50 dataset Returns ------- list The C50 news as a list of dict Example --------- Cf :py:class:`~gismo.sentencizer.Sentencizer` """ r=requests.get(url) with ZipFile(io.BytesIO(r.content)) as z: return [get_reuters_entry(name, z) for name in z.namelist() if name.endswith('.txt')]