import io
from pathlib import Path
import requests
import zlib
import json
import dill as pickle
import numpy as np
import gzip
from lxml import etree
URL = "https://dblp.uni-trier.de/xml/dblp.xml.gz"
"""
URL of the full DBLP database.
"""
DTD_URL = "https://dblp.uni-trier.de/xml/dblp.dtd"
"""
URL of the dtd file (required to correctly parse non-ASCII characters).
"""
DEFAULT_FIELDS = {'type', 'title', 'authors', 'venue', 'year'}
"""
Default fields to extract.
"""
LIST_TYPE_FIELDS = {'urls', 'authors'}
"""
DBLP fields with possibly multiple entries.
"""
FIELD_REDIRECTION = {'journal': 'venue',
                     'booktitle': 'venue',
                     'author': 'authors',
                     'ee': 'urls'}
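"""
Translation of specific DBLP fields to a canonical name, e.g. both ``journal``
and ``booktitle`` are stored as ``venue``.
"""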
def fast_iter(context, func, d=2, **kwargs):
"""
    Applies ``func`` to all xml elements of depth lower than ``d`` of the xml parser ``context``.
``**kwargs`` are passed to ``func``.
    Modified version of Liza Daly's fast_iter, adapted from
    https://stackoverflow.com/questions/4695826/efficient-way-to-iterate-through-xml-elements
Parameters
----------
    context: iterator
        An iterator of (event, element) pairs, as obtained from ``etree.iterparse``.
func: function
How to process the elements
    d: int, optional
        Elements at depth lower than ``d`` are processed.
"""
    depth = 0
    for event, elem in context:
        if event == 'start':
            depth += 1
        elif event == 'end':
            depth -= 1
            if depth < d:
                func(elem, **kwargs)
                # It's safe to call clear() here because no descendants will be
                # accessed
                elem.clear()
                # Also eliminate now-empty references from the root node to elem
                for ancestor in elem.xpath('ancestor-or-self::*'):
                    while ancestor.getprevious() is not None:
                        del ancestor.getparent()[0]
    del context
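
# A minimal usage sketch (hypothetical, not part of the module): stream an XML
# file from disk and print the tag of every processed element, without ever
# holding the whole tree in memory. The file name is an assumption.
def _example_fast_iter(path="dblp_excerpt.xml"):
    with open(path, "rb") as f:
        context = etree.iterparse(f, events=('start', 'end'))
        fast_iter(context, lambda elem: print(elem.tag))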
def xml_element_to_dict(elt, fields):
"""
    Converts the xml element ``elt`` into a dict if it describes an article.
Parameters
----------
    elt: Any
        An XML element.
fields: set
Set of entries to retrieve.
Returns
-------
dict or None
Article dictionary if element contains the attributes of an article, None otherwise.
"""
    children = list(elt)  # getchildren() is deprecated in lxml
if not children:
return None
dic = {"type": elt.tag} if "type" in fields else dict()
for c in children:
value = c.text
key = c.tag
key = FIELD_REDIRECTION.get(key, key)
if key not in fields or not isinstance(value, str):
continue
if key in LIST_TYPE_FIELDS:
            dic.setdefault(key, []).append(value)
else:
dic[key] = value
if not dic.get('authors') or not all(key in dic for key in ['year', 'title', 'venue']):
return None
return dic
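
# A minimal sketch (hypothetical, not part of the module): build a small XML
# element by hand and convert it with the default fields. Note how 'author'
# and 'journal' are redirected to 'authors' and 'venue'.
def _example_xml_element_to_dict():
    elt = etree.fromstring(
        "<article><author>Ada Lovelace</author><title>Notes</title>"
        "<year>1843</year><journal>Taylor</journal></article>"
    )
    return xml_element_to_dict(elt, DEFAULT_FIELDS)
    # -> {'type': 'article', 'authors': ['Ada Lovelace'],
    #     'title': 'Notes', 'year': '1843', 'venue': 'Taylor'}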
def element_to_source(elt, source, fields):
"""
    Tests whether ``elt`` is an article; if so, converts it to a dictionary and appends it to ``source``.
Parameters
----------
    elt: Any
        An XML element.
source: list
the source in construction.
fields: set
Set of fields to retrieve.
"""
dic = xml_element_to_dict(elt, fields)
if dic is not None:
source.append(dic)
def url2source(url, fields=None):
"""
    Directly transforms the URL of a dblp xml file into a list of dictionaries.
    Only use this for datasets that fit into memory (e.g. the articles of one author).
    If the dataset does not fit, consider using the Dblp class instead.
Parameters
----------
url: str
the URL to fetch.
    fields: set, optional
        Set of DBLP fields to capture. Defaults to :obj:`~gismo.datasets.dblp.DEFAULT_FIELDS`.
Returns
-------
source: list of dict
Articles retrieved from the URL
Example
-------
>>> source = url2source("https://dblp.org/pers/xx/t/Tixeuil:S=eacute=bastien.xml", fields={'authors', 'title', 'year', 'venue', 'urls'})
>>> art = [s for s in source if s['title']=="Distributed Computing with Mobile Robots: An Introductory Survey."][0]
>>> art['authors']
['Maria Potop-Butucaru', 'Michel Raynal', 'Sébastien Tixeuil']
>>> art['urls']
['https://doi.org/10.1109/NBiS.2011.55', 'http://doi.ieeecomputersociety.org/10.1109/NBiS.2011.55']
"""
if fields is None:
fields = DEFAULT_FIELDS
r = requests.get(url)
source = []
with io.BytesIO(r.content) as f:
context = etree.iterparse(f, events=('start', 'end',))
fast_iter(context, element_to_source, d=3, source=source, fields=fields)
return source
def element_to_filesource(elt, data_handler, index, fields):
"""
* Converts the xml element ``elt`` into a dict if it is an article.
* Compress and write the dict in ``data_handler``
* Append file position in ``data_handler`` to ``index``.
Parameters
----------
    elt: Any
        An XML element.
data_handler: file_descriptor
Where the compressed data will be stored. Must be writable.
    index: list
        List of file offsets delimiting the records written so far in ``data_handler``;
        the end position of the current record is appended to it.
fields: set
Set of fields to retrieve.
Returns
-------
    bool
        Always returns True, for compatibility with the xml parser.
"""
dic = xml_element_to_dict(elt=elt, fields=fields)
if dic is None:
return True
data_handler.write(zlib.compress(json.dumps(dic).encode('utf8')))
index.append(data_handler.tell())
return True
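
# A minimal sketch (hypothetical, not part of the module): read back the i-th
# record written by element_to_filesource, using the offsets accumulated in
# ``index`` (record i lies between index[i] and index[i+1]).
def _example_read_record(data_path, index, i):
    with open(data_path, "rb") as f:
        f.seek(index[i])
        blob = f.read(index[i + 1] - index[i])
    return json.loads(zlib.decompress(blob).decode('utf8'))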
class Dblp:
"""
    The DBLP class can download the DBLP database and produce source files compatible with the :class:`~gismo.filesource.FileSource` class.
Parameters
----------
dblp_url: str, optional
Alternative URL for the dblp.xml.gz file
    filename: str, optional
        Stem of the files (suffixes will be appended).
path: str or path, optional
Destination of the files
"""
def __init__(self, dblp_url=URL, filename="dblp",
path="."):
self.dblp_url = dblp_url
self.path = Path(path)
self.dblp_xml = self.path / Path(f"{filename}.xml.gz")
self.dblp_data = self.path / Path(f"{filename}.data")
self.dblp_index = self.path / Path(f"{filename}.index")
self.xml_handler = None
self.json_handler = None
self._index = None
    def download(self):
        """
        Download the dblp file from ``dblp_url`` to ``dblp_xml``.
        The local copy is always gzipped: if the URL does not point to a gz file,
        the content is compressed on the fly.
        """
        r = requests.get(self.dblp_url, stream=True)
        if self.dblp_url.endswith("gz"):
            # The remote file is already gzipped: save the bytes as-is.
            with open(self.dblp_xml, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)
        else:
            # Plain xml: compress it locally so that build() can always gzip.open it.
            with gzip.open(self.dblp_xml, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)
        print(f"DBLP database downloaded to {self.dblp_xml}.")
    def build(self, refresh=False, d=2, fields=None):
"""
Main class method. Create the data and index files.
Parameters
----------
        refresh: bool
            If True, existing files are overwritten; otherwise they are reused.
        d: int
            Depth level where articles are. Usually 2 or 3 (2 for the main database).
fields: set, optional
Set of fields to collect. Default to :obj:`~gismo.datasets.dblp.DEFAULT_FIELDS`.
Example
-------
        By default, the class downloads the full dataset. Here we restrict ourselves to a single author page.
>>> toy_url = "https://dblp.org/pers/xx/m/Mathieu:Fabien.xml"
>>> import tempfile
>>> from gismo.filesource import FileSource
>>> tmp = tempfile.TemporaryDirectory()
>>> dblp = Dblp(dblp_url=toy_url, path=tmp.name)
        >>> dblp.build()  # doctest: +ELLIPSIS
Retrieve https://dblp.org/pers/xx/m/Mathieu:Fabien.xml from the Internet.
DBLP database downloaded to ...xml.gz.
Converting DBLP database from ...xml.gz (may take a while).
Building Index.
Conversion done.
By default, build uses existing files.
        >>> dblp.build()  # doctest: +ELLIPSIS
File ...xml.gz already exists. Use refresh option to overwrite.
File ...data already exists. Use refresh option to overwrite.
The refresh parameter can be used to ignore existing files.
        >>> dblp.build(d=3, refresh=True)  # doctest: +ELLIPSIS
Retrieve https://dblp.org/pers/xx/m/Mathieu:Fabien.xml from the Internet.
DBLP database downloaded to ...xml.gz.
Converting DBLP database from ...xml.gz (may take a while).
Building Index.
Conversion done.
The resulting files can be used to create a FileSource.
>>> source = FileSource(filename="dblp", path=tmp.name)
>>> art = [s for s in source if s['title']=="Can P2P networks be super-scalable?"][0]
        >>> art['authors']  # doctest: +ELLIPSIS
['François Baccelli', 'Fabien Mathieu', 'Ilkka Norros', 'Rémi Varloot']
Don't forget to close source after use.
>>> source.close()
>>> tmp.cleanup()
"""
if fields is None:
fields = DEFAULT_FIELDS
if self.dblp_xml.exists() and not refresh:
print(f"File {self.dblp_xml} already exists. Use refresh option to overwrite.")
else:
print(f"Retrieve {self.dblp_url} from the Internet.")
self.download()
if self.dblp_data.exists() and not refresh:
print(f"File {self.dblp_data} already exists. Use refresh option to overwrite.")
else:
print(f"Converting DBLP database from {self.dblp_xml} (may take a while).")
            # Download the DTD file (required to correctly parse non-ASCII characters)
            r = requests.get(DTD_URL)
with open(self.path / Path("dblp.dtd"), 'w') as f:
f.write(r.text)
with gzip.open(self.dblp_xml, "rb") as f:
index = [0]
with open(self.dblp_data, "wb") as g:
context = etree.iterparse(f, events=('start', 'end',), load_dtd=True)
fast_iter(context, element_to_filesource, d=d, data_handler=g, index=index, fields=fields)
print(f"Building Index.")
with open(self.dblp_index, "wb") as g:
pickle.dump(np.array(index), g)
print(f"Conversion done.")