Source code for gismo.datasets.reuters

import requests
from zipfile import ZipFile
import io

# Alternative location of the archive on the UCI repository (currently disabled):
# URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00217/C50.zip"
# Location of the C50 zip archive; used as the default `url` of `get_reuters_news`.
URL = "https://github.com/balouf/datasets/raw/main/C50.zip"



[docs]
def get_reuters_entry(name, z):
    """
    Load one news file from the Reuters zip archive and describe it as a dict.

    The metadata is derived from the `/`-separated components of `name`
    (expected layout: `<set>/<author>/<file>`), the text from the archive member.

    Parameters
    ----------
    name: str
        Path of the member inside the archive.
    z: ZipFile
        Open archive containing `name`.

    Returns
    -------
    entry: dict
        Keys, in order: `set` (first path component, e.g. `C50train` or `C50test`),
        `author` (second component), `id` (third component minus its last four
        characters, i.e. the `.txt` suffix) and `content` (the decoded text).
    """
    with z.open(name) as stream:
        pieces = name.split("/")
        # Metadata first, so a malformed path fails before any data is read.
        entry = dict(set=pieces[0], author=pieces[1], id=pieces[2][:-4])
        payload = stream.read()
        entry["content"] = payload.decode()
    return entry




[docs]
def get_reuters_news(url=URL, timeout=60):
    """
    Returns a list of news from the Reuters C50 news datasets

    The archive is downloaded in memory, and every member whose name ends with
    `.txt` is converted to a dict by :py:func:`get_reuters_entry`.

    Acknowledgments

    Dua, D. and Graff, C. (2019). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml].
    Irvine, CA: University of California, School of Information and Computer Science.

    ZhiLiu, e-mail: liuzhi8673 '@' gmail.com,
    institution: National Engineering Research Center for E-Learning, Hubei Wuhan, China

    Parameters
    ----------
    url: str
        Location of the C50 dataset
    timeout: float or tuple or None
        Timeout, in seconds, forwarded to `requests.get`. `None` waits indefinitely.

    Returns
    -------
    list
        The C50 news as a list of dict (keys `set`, `author`, `id`, `content`),
        in the order of the archive listing.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with an error status (4xx / 5xx).
    requests.exceptions.Timeout
        If the server does not respond within `timeout`.
    zipfile.BadZipFile
        If the downloaded payload is not a valid zip archive.

    Example
    -------
    Cf :py:class:`~gismo.sentencizer.Sentencizer`
    """
    # A timeout prevents the call from blocking forever on an unresponsive server.
    r = requests.get(url, timeout=timeout)
    # Surface HTTP failures explicitly instead of feeding an error page to ZipFile.
    r.raise_for_status()
    with ZipFile(io.BytesIO(r.content)) as z:
        return [
            get_reuters_entry(name, z) for name in z.namelist() if name.endswith(".txt")
        ]