Source code for scrubadub_stanford.detectors.stanford

"""
This module provides a detector to detect people's names using the Stanford CRF NER tagger.

See https://nlp.stanford.edu/software/CRF-NER.html for more details on the Stanford CRF NER Tagger

This detector requires java and the python package `nltk`.
The Stanford CRF NER Tagger will be downloaded to `~/.scrubadub/stanford_ner` and takes around 250MB.
"""
import re
import os
import pathlib
import zipfile
import requests
try:
    import nltk
except ImportError:
    raise ImportError(
        'To use scrubadub.detectors.stanford extra dependencies need to be installed.\n'
        'Please run: pip install scrubadub[stanford]'
    )

from typing import Dict, Type, Optional, List

from scrubadub.detectors.catalogue import register_detector
from scrubadub.detectors.base import Detector
from scrubadub.filth.base import Filth
from scrubadub.filth.name import NameFilth
from scrubadub.filth.organization import OrganizationFilth
from scrubadub.filth.location import LocationFilth


class ScrubadubStanfordNERTagger(nltk.tag.StanfordNERTagger):
    """``StanfordNERTagger`` subclass that pins the options passed to the Stanford CRF classifier."""

    @property
    def _cmd(self):
        """Return the argument list for ``edu.stanford.nlp.ie.crf.CRFClassifier``.

        The list is assembled from the model path, input file path and output format stored on the
        instance, followed by fixed tokenizer and threading options.
        NOTE(review): presumably the nltk base class consumes this when it launches Java -- confirm
        against the installed nltk version.
        """
        # Which classifier to run and the serialized model it should load.
        classifier_args = [
            "edu.stanford.nlp.ie.crf.CRFClassifier",
            "-loadClassifier",
            self._stanford_model,
        ]
        # Where the text to tag is read from and how the result is formatted.
        io_args = [
            "-textFile",
            self._input_file_path,
            "-outputFormat",
            self._FORMAT,
        ]
        # Whitespace tokenizer with ``tokenizeNLs=false``.
        tokenizer_args = [
            "-tokenizerFactory",
            "edu.stanford.nlp.process.WhitespaceTokenizer",
            "-tokenizerOptions",
            '"tokenizeNLs=false"',
        ]
        # Restrict the classifier to one thread.
        thread_args = ['-nthreads', '1']
        return classifier_args + io_args + tokenizer_args + thread_args


class StanfordEntityDetector(Detector):
    """Search for people's names, organization's names and locations within text using the stanford 3 class model.

    The three classes of this model can be enabled with the three arguments to the initialiser `enable_person`,
    `enable_organization` and `enable_location`.
    An example of their usage is given below.

    >>> import scrubadub, scrubadub_stanford
    >>> detector = scrubadub_stanford.detectors.StanfordEntityDetector(
    ...     enable_person=False, enable_organization=False, enable_location=True
    ... )
    >>> scrubber = scrubadub.Scrubber(detector_list=[detector])
    >>> scrubber.clean('Jane is visiting London.')
    'Jane is visiting {{LOCATION}}.'
    """
    filth_cls = Filth
    name = "stanford"
    # Tokens that are never reported as filth, compared case-insensitively after stripping whitespace.
    ignored_words = ["tennant"]

    # TODO: NER model has been wrapped into the coreNLP package, which has version 4.1.0 out now.
    #       The download script needs to be updated.
    # TODO: Add support for Spanish, German, Chinese, French (No Arabic NER model)
    stanford_version = "4.0.0"
    stanford_download_url = 'https://nlp.stanford.edu/software/stanford-ner-{version}.zip'

    def __init__(self, enable_person: bool = True, enable_organization: bool = True, enable_location: bool = False,
                 **kwargs):
        """Initialise the ``Detector``.

        :param enable_person: If ``True``, words tagged ``PERSON`` are reported as ``NameFilth``.
        :type enable_person: bool, default True
        :param enable_organization: If ``True``, words tagged ``ORGANIZATION`` are reported as
            ``OrganizationFilth``.
        :type enable_organization: bool, default True
        :param enable_location: If ``True``, words tagged ``LOCATION`` are reported as ``LocationFilth``.
        :type enable_location: bool, default False
        :param name: Overrides the default name of the :class:``Detector``
        :type name: str, optional
        :param locale: The locale of the documents in the format: 2 letter lower-case language code followed by an
                       underscore and the two letter upper-case country code, eg "en_GB" or "de_CH".
        :type locale: str, optional
        """
        # The tagger is created lazily on the first call to ``iter_filth``.
        self.stanford_tagger = None  # type: Optional[nltk.tag.StanfordNERTagger]

        # Maps the tag emitted by the tagger onto the Filth class that is yielded for it.
        self.filth_lookup = {}  # type: Dict[str, Type[Filth]]
        if enable_person:
            self.filth_lookup['PERSON'] = NameFilth
        if enable_organization:
            self.filth_lookup['ORGANIZATION'] = OrganizationFilth
        if enable_location:
            self.filth_lookup['LOCATION'] = LocationFilth

        # Every path below keeps a ``{version}`` placeholder that is filled in with
        # ``self.stanford_version`` at the point of use.
        release_dir = 'stanford-ner-{version}'
        self.stanford_classifier = os.path.join(release_dir, 'classifiers', 'english.all.3class.distsim.crf.ser.gz')
        self.stanford_prefix = str(pathlib.Path.home().joinpath('.scrubadub').joinpath('stanford_ner'))
        self.stanford_download_path = os.path.join(self.stanford_prefix, 'stanford-ner-{version}.zip')
        self.stanford_classifier_path = os.path.join(self.stanford_prefix, self.stanford_classifier)
        self.stanford_ner_jar_path = os.path.join(self.stanford_prefix, release_dir, 'stanford-ner.jar')
        # Files (relative to ``stanford_prefix``) that are extracted from the downloaded archive.
        self.stanford_files = [
            self.stanford_classifier,
            os.path.join(release_dir, 'stanford-ner.jar'),
            os.path.join(release_dir, 'stanford-ner-{version}.jar'),
            os.path.join(release_dir, 'stanford-ner-{version}-javadoc.jar'),
            os.path.join(release_dir, 'stanford-ner-{version}-sources.jar'),
        ]

        super(StanfordEntityDetector, self).__init__(**kwargs)

    def _check_downloaded(self):
        """Find out if the stanford NER tagger has already been downloaded.

        :return: ``True`` if every file listed in ``self.stanford_files`` exists under
            ``self.stanford_prefix``, otherwise ``False``
        :rtype: bool
        """
        return all(
            os.path.exists(os.path.join(self.stanford_prefix, file_name.format(version=self.stanford_version)))
            for file_name in self.stanford_files
        )

    def _download(self):
        """Download and extract the needed files from the Stanford NER tagger.

        The archive is only fetched if it is not already present at ``self.stanford_download_path``.

        :raises requests.HTTPError: if the server answers the download request with an error status
        :raises RuntimeError: if the needed files are still missing after extraction
        """
        # Make the data directory
        pathlib.Path(self.stanford_prefix).mkdir(parents=True, exist_ok=True)

        download_url = self.stanford_download_url.format(version=self.stanford_version)
        download_path = self.stanford_download_path.format(version=self.stanford_version)

        # Download the NER tagger
        if not pathlib.Path(download_path).exists():
            download_request = requests.get(download_url)
            # Without this an HTTP error page would be saved as the archive and, because the file
            # then exists, it would never be downloaded again.
            download_request.raise_for_status()
            # Write to a temporary name and rename afterwards so that an interrupted write cannot
            # leave a truncated archive at ``download_path``.
            partial_path = download_path + '.part'
            with open(partial_path, 'wb') as download_file:
                download_file.write(download_request.content)
            os.replace(partial_path, download_path)

        # Extract the needed files
        with zipfile.ZipFile(download_path, 'r') as downloaded_zip_file:
            for file_to_extract in self.stanford_files:
                relative_path = file_to_extract.format(version=self.stanford_version)
                downloaded_zip_file.extract(
                    # Archive member names use forward slashes, whereas ``stanford_files`` was built
                    # with ``os.path.join`` and so uses the platform's separator.
                    member=pathlib.PurePath(relative_path).as_posix(),
                    path=self.stanford_prefix,
                )

        # Ensure it extracted the files that we need
        if not self._check_downloaded():
            raise RuntimeError(
                "Unable to download the Stanford NER tagger from {url}, perhaps try again?".format(
                    url=download_url
                )
            )

    def iter_filth(self, text, document_name: Optional[str] = None):
        """Yields discovered filth in the provided ``text``.

        On the first call the Stanford NER tagger is downloaded (if needed) and initialised.

        :param text: The dirty text to clean.
        :type text: str
        :param document_name: The name of the document to clean.
        :type document_name: str, optional
        :return: An iterator to the discovered :class:`Filth`
        :rtype: Iterator[:class:`Filth`]
        """
        if self.stanford_tagger is None:
            if not self._check_downloaded():
                self._download()
            self.stanford_tagger = ScrubadubStanfordNERTagger(
                self.stanford_classifier_path.format(version=self.stanford_version),
                self.stanford_ner_jar_path.format(version=self.stanford_version),
            )

        tokens = nltk.tokenize.word_tokenize(text)
        if not tokens:
            # Nothing to tag, so there is no filth and no need to run the tagger.
            return
        tags = self.stanford_tagger.tag(tokens)

        # Normalise the ignored words once rather than once per token.
        ignored = {word.lower().strip() for word in self.ignored_words}

        grouped_tags = {}  # type: Dict[str, List[str]]
        previous_tag = None

        # Loop over all tagged words and join contiguous words that share the same enabled tag
        for tag_text, tag_type in tags:
            if tag_type in self.filth_lookup and tag_text.lower().strip() not in ignored:
                if previous_tag == tag_type:
                    # Continuation of the previous entity: extend its text
                    grouped_tags[tag_type][-1] += ' ' + tag_text
                else:
                    grouped_tags.setdefault(tag_type, []).append(tag_text)
                previous_tag = tag_type
            else:
                previous_tag = None

        # For each tag type, de-dupe the entities and look for them in the original document
        for tag_type, entities in grouped_tags.items():
            filth_class = self.filth_lookup[tag_type]
            for entity in set(entities):
                # Words of an entity were joined with one space; allow any whitespace between them
                # when matching against the original text.
                pattern = re.compile(
                    r'\b' + re.escape(entity).replace(r'\ ', r'\s+') + r'\b',
                    re.MULTILINE | re.UNICODE,
                )
                # Iterate over each found string matching this regex and yield some filth
                for instance in pattern.finditer(text):
                    yield filth_class(
                        beg=instance.start(),
                        end=instance.end(),
                        text=instance.group(),
                        detector_name=self.name,
                        document_name=document_name,
                        locale=self.locale,
                    )

    @classmethod
    def supported_locale(cls, locale: str) -> bool:
        """Returns true if this ``Detector`` supports the given locale.

        :param locale: The locale of the documents in the format: 2 letter lower-case language code followed by an
                       underscore and the two letter upper-case country code, eg "en_GB" or "de_CH".
        :type locale: str
        :return: ``True`` if the locale is supported, otherwise ``False``
        :rtype: bool
        """
        # Only the language part matters: the classifier used here is the English 3 class model.
        language, _region = cls.locale_split(locale)
        return language == 'en'
# Register the detector with scrubadub's detector catalogue when this module is imported.
register_detector(StanfordEntityDetector)

# Public API of this module.
__all__ = ["StanfordEntityDetector"]