# Source code for gismo.sentencizer

from spacy.lang.en import English

from gismo.corpus import Corpus
from gismo.embedding import Embedding
from gismo.gismo import Gismo


class Sentencizer:
    """
    The Sentencizer class refines a document-level Gismo into a sentence-level
    Gismo. A simple all-in-one sentence extraction method is provided. For more
    complex usages, the class can expose a full :py:class:`~gismo.gismo.Gismo`
    instance that operates at sentence level.

    Parameters
    ----------
    gismo: Gismo
        Document-level Gismo.

    Examples
    --------
    We use the C50 Reuters dataset (5000 news paragraphs).

    >>> from gismo.datasets.reuters import get_reuters_news
    >>> corpus = Corpus(get_reuters_news(), to_text=lambda e: e['content'])
    >>> embedding = Embedding()
    >>> embedding.fit_transform(corpus)
    >>> gismo = Gismo(corpus, embedding)
    >>> sentencer = Sentencizer(gismo)

    First example: explicitly run the query *Orange* at document level, then
    extract 4 covering sentences with a narrow BFS.

    >>> success = gismo.rank("Orange")
    >>> sentencer.get_sentences(s=4, wide=False) # doctest: +NORMALIZE_WHITESPACE
    ['Snook says the all important average retained revenue per Orange subscriber will rise
    from around 442 pounds per year, partly because dominant telecoms player British
    Telecommunications last month raised the price of a call to Orange phones from its fixed
    lines.', 'Analysts said that Orange shares had good upside potential after a rollercoaster
    ride in their short time on the market.', 'Orange, which was floated last March at 205
    pence per share, initially saw its stock slump to 157.5 pence before recovering over the
    last few months to trade at 218 on Tuesday, a rise of four pence on the day.', 'One-2-One
    and Orange ORA.L, which offer only digital services, are due to release their connection
    figures next week.']

    Second example: extract *Ericsson*-related sentences.

    >>> sentencer.get_sentences(query="Ericsson") # doctest: +NORMALIZE_WHITESPACE
    ['These latest wins follow a recent $350 million contract win with Telefon AB L.M.
    Ericsson, bolstering its already strong activity in the contract manufacturing of
    telecommuncation and data communciation products, he said.', 'The restraints are few in
    areas such as consumer products, while in sectors such as banking, distribution and
    insurance, foreign firms are kept on a very tight leash.', "The company also said it had
    told analysts in a briefing Tuesday of new contract wins with Ascend Communications Inc,
    Harris Corp's Communications unit and Philips Electronics NV.", 'Pocket is the first from
    the high-priced 1996 auction known to have filed for bankruptcy protection.', 'With Ascend
    in particular, he said the company would be manufacturing the company\\'s mainstream MAX
    TNT remote access network equipment. "']

    Third example: extract *communications*-related sentences from a string.

    >>> txt = gismo.corpus[4517]['content']
    >>> sentencer.get_sentences(query="communications", txt=txt) # doctest: +NORMALIZE_WHITESPACE
    ["Privately-held Pocket's big creditors include a group of Asian entrepreneurs and
    communications-equipment makers Siemens AG of Germany and L.M. Ericsson of Sweden.",
    "2 bidder at the government's high-flying wireless phone auction last year has filed for
    bankruptcy protection from its creditors, underscoring the problems besetting the
    auction's winners.", "The Federal Communications Commission on Monday gave PCS companies
    from last year's auction some breathing space when it suspended indefinitely a March 31
    deadline for them to make payments to the agency for their licenses."]
    """

    def __init__(self, gismo):
        # Blank spaCy English pipeline with only the rule-based sentencizer component.
        self.parser = English()
        self.parser.add_pipe('sentencizer')
        self.doc_gismo = gismo
        self.sent_corpus = None
        self.sent_gismo = None
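
    # For reference, a minimal sketch (not part of the library code) of what the
    # parser built in ``__init__`` does: a blank spaCy English pipeline whose only
    # component is the rule-based sentencizer, which splits raw text into
    # sentence spans.
    #
    #     nlp = English()
    #     nlp.add_pipe('sentencizer')
    #     doc = nlp("One sentence. Another sentence.")
    #     [str(s).strip() for s in doc.sents]   # -> ['One sentence.', 'Another sentence.']
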
    def splitter(self, txt):
        """
        Transform input content into a corpus of sentences, stored in the
        :py:attr:`sent_corpus` attribute.

        Parameters
        ----------
        txt: str or list
            Text or list of documents to split into sentences. For the latter,
            documents are assumed to be provided as `(content, id)` pairs,
            where `content` is the actual text and `id` a reference to the
            document.

        Returns
        -------
        Sentencizer
        """
        if isinstance(txt, str):
            # Single text: keep each sentence longer than 10 tokens as a plain string.
            source = [str(sent).strip() for sent in self.parser(txt).sents
                      if len(sent) > 10]
            self.sent_corpus = Corpus(source, to_text=lambda x: x)
        else:
            # List of (content, id) pairs: keep the originating document id
            # alongside each extracted sentence.
            source = [{'source': doc[1], 'content': str(sent).strip()}
                      for doc in txt
                      for sent in self.parser(doc[0]).sents
                      if len(sent) > 10]
            self.sent_corpus = Corpus(source, to_text=lambda x: x['content'])
        return self
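
    # A brief illustration (not from the original docs) of the two input forms
    # accepted by ``splitter``; ``sentencer``, ``some_long_text``, ``doc_text``
    # and ``doc_id`` are hypothetical placeholders. Note that sentences of 10
    # tokens or fewer are discarded.
    #
    #     sentencer.splitter(some_long_text)
    #     sentencer.sent_corpus[0]   # a plain sentence string
    #
    #     sentencer.splitter([(doc_text, doc_id)])
    #     sentencer.sent_corpus[0]   # {'source': doc_id, 'content': sentence}
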
    def make_sent_gismo(self, query=None, txt=None, k=None, **kwargs):
        """
        Construct a sentence-level Gismo, stored in the :py:attr:`sent_gismo`
        attribute.

        Parameters
        ----------
        query: str (optional)
            Query to run on the document-level Gismo.
        txt: str (optional)
            Text to use for sentence extraction. If not set, the sentences
            will be extracted from the top documents.
        k: int (optional)
            Number of top documents used for the build. If not set, the
            :py:func:`~gismo.common.auto_k` heuristic will be used.
        kwargs: dict
            Custom default runtime parameters to pass to the sentence-level
            Gismo. You only need to specify the parameters that differ from
            :obj:`~gismo.parameters.DEFAULT_PARAMETERS`. Note that distortion
            will be automatically deactivated. If you really want it, manually
            change the value of ``self.sent_gismo.parameters.distortion``
            afterwards.

        Returns
        -------
        Sentencizer
        """
        if txt is None:
            # No explicit text: use the top documents of the document-level Gismo.
            if query is not None:
                self.doc_gismo.rank(query)
            txt = [(self.doc_gismo.corpus.to_text(self.doc_gismo.corpus[i]), i)
                   for i in self.doc_gismo.get_documents_by_rank(k, post=False)]
        self.splitter(txt)
        # Embed the sentence corpus with the features of the document-level embedding.
        local_embedding = Embedding()
        local_embedding.fit_ext(self.doc_gismo.embedding)
        local_embedding.transform(self.sent_corpus)
        self.sent_gismo = Gismo(self.sent_corpus, local_embedding, **kwargs)
        self.sent_gismo.parameters.distortion = 0.0
        self.sent_gismo.post_documents_item = lambda g, i: g.corpus.to_text(g.corpus[i])
        return self
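
    # An illustrative sketch (not part of the original docs) of using
    # ``make_sent_gismo`` directly when the sentence-level Gismo itself is
    # needed; ``gismo`` is a fitted document-level Gismo as in the class
    # docstring.
    #
    #     sentencer = Sentencizer(gismo)
    #     sentencer.make_sent_gismo(query="Orange", k=10)
    #     sentencer.sent_gismo.rank("Orange")
    #     sentencer.sent_gismo.get_documents_by_rank(5, post=True)   # top-5 sentences as text
    #
    # Distortion is forced to 0.0 on the sentence-level Gismo; re-enable it
    # afterwards through ``sentencer.sent_gismo.parameters.distortion`` if needed.
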
    def get_sentences(self, query=None, txt=None, k=None, s=None, resolution=.7,
                      stretch=2.0, wide=True, post=True):
        """
        All-in-one method that extracts covering sentences from the corpus.
        It builds the sentence-level corpus and the sentence-level Gismo, then
        calls :py:meth:`~gismo.gismo.Gismo.get_documents_by_coverage`.

        Parameters
        ----------
        query: str (optional)
            Query to run on the document-level Gismo.
        txt: str (optional)
            Text to use for sentence extraction. If not set, the sentences
            will be extracted from the top documents.
        k: int (optional)
            Number of top documents used for the build. If not set, the
            :py:func:`~gismo.common.auto_k` heuristic of the document-level
            Gismo will be used.
        s: int (optional)
            Number of sentences to return. If not set, the
            :py:func:`~gismo.common.auto_k` heuristic of the sentence-level
            Gismo will be used.
        resolution: float (optional)
            Tree resolution passed to the
            :py:meth:`~gismo.gismo.Gismo.get_documents_by_coverage` method.
        stretch: float >= 1 (optional)
            Stretch factor passed to the
            :py:meth:`~gismo.gismo.Gismo.get_documents_by_coverage` method.
        wide: bool (optional)
            BFS wideness passed to the
            :py:meth:`~gismo.gismo.Gismo.get_documents_by_coverage` method.
        post: bool (optional)
            Use of post-processing passed to the
            :py:meth:`~gismo.gismo.Gismo.get_documents_by_coverage` method.

        Returns
        -------
        list
        """
        self.make_sent_gismo(query=query, txt=txt, k=k)
        if query is None:
            # Fall back on the last query run on the document-level Gismo.
            query = self.doc_gismo.embedding._query
        self.sent_gismo.rank(query)
        return self.sent_gismo.get_documents_by_coverage(k=s, resolution=resolution,
                                                         stretch=stretch, wide=wide,
                                                         post=post)
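

# A minimal end-to-end usage sketch (illustrative, not part of the library): it
# chains a document-level Gismo built on the Reuters C50 dataset, as in the
# class docstring above, with a Sentencizer to print covering sentences for a
# query.
if __name__ == "__main__":
    from gismo.datasets.reuters import get_reuters_news

    # Document-level pipeline: corpus -> embedding -> Gismo.
    corpus = Corpus(get_reuters_news(), to_text=lambda e: e['content'])
    embedding = Embedding()
    embedding.fit_transform(corpus)
    gismo = Gismo(corpus, embedding)

    # Sentence-level refinement: 4 covering sentences about "Orange".
    sentencer = Sentencizer(gismo)
    for sentence in sentencer.get_sentences(query="Orange", s=4):
        print(sentence)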