# Source code for scrubadub_stanford.detectors.stanford

"""
This module provides a detector that finds the names of people, organizations and locations using the Stanford CRF
NER tagger.

See https://nlp.stanford.edu/software/CRF-NER.html for more details on the Stanford CRF NER Tagger

This detector requires java and the python package `nltk`.
The Stanford CRF NER Tagger will be downloaded to `~/.scrubadub/stanford_ner` and takes around 250MB.
"""
import re
import os
import pathlib
import zipfile
import requests
try:
    import nltk
except ImportError:
    raise ImportError(
        'To use scrubadub.detectors.stanford extra dependencies need to be installed.\n'
        'Please run: pip install scrubadub[stanford]'
    )

from typing import Dict, Type, Optional, List

from scrubadub.detectors.catalogue import register_detector
from scrubadub.detectors.base import Detector
from scrubadub.filth.base import Filth
from scrubadub.filth.name import NameFilth
from scrubadub.filth.organization import OrganizationFilth
from scrubadub.filth.location import LocationFilth


class ScrubadubStanfordNERTagger(nltk.tag.StanfordNERTagger):
    """Stanford NER tagger wrapper that fixes the command-line options passed to java."""

    @property
    def _cmd(self):
        """Return the argument list used to invoke the Stanford ``CRFClassifier``.

        The list is assembled from three groups: the classifier/model/input arguments, the tokenizer
        configuration and the thread count.
        """
        # Which classifier to run, the model to load, the text file to read and the output format.
        classifier_args = [
            "edu.stanford.nlp.ie.crf.CRFClassifier",
            "-loadClassifier", self._stanford_model,
            "-textFile", self._input_file_path,
            "-outputFormat", self._FORMAT,
        ]
        # Select the whitespace tokenizer and pass it the ``tokenizeNLs=false`` option.
        tokenizer_args = [
            "-tokenizerFactory", "edu.stanford.nlp.process.WhitespaceTokenizer",
            "-tokenizerOptions", '"tokenizeNLs=false"',
        ]
        # Run the classifier with a single thread.
        threading_args = ['-nthreads', '1']

        return classifier_args + tokenizer_args + threading_args


class StanfordEntityDetector(Detector):
    """Search for people's names, organization's names and locations within text using the stanford 3 class model.

    The three classes of this model can be enabled with the three arguments to the initialiser `enable_person`,
    `enable_organization` and `enable_location`.
    An example of their usage is given below.

    >>> import scrubadub, scrubadub_stanford
    >>> detector = scrubadub_stanford.detectors.StanfordEntityDetector(
    ...     enable_person=False, enable_organization=False, enable_location=True
    ... )
    >>> scrubber = scrubadub.Scrubber(detector_list=[detector])
    >>> scrubber.clean('Jane is visiting London.')
    'Jane is visiting {{LOCATION}}.'
    """
    filth_cls = Filth
    name = "stanford"
    # Tokens that are never reported as filth, compared case-insensitively after stripping whitespace.
    ignored_words = ["tennant"]

    # TODO: NER model has been wrapped into the coreNLP package, which has version 4.1.0 out now.
    #  The download script needs to be updated.
    # TODO: Add support for Spanish, German, Chinese, French (No Arabic NER model)
    stanford_version = "4.0.0"
    stanford_download_url = 'https://nlp.stanford.edu/software/stanford-ner-{version}.zip'
    # Seconds to wait for the download server to connect / send data before giving up.
    stanford_download_timeout = 60

    def __init__(self, enable_person: bool = True, enable_organization: bool = True, enable_location: bool = False,
                 **kwargs):
        """Initialise the ``Detector``.

        :param enable_person: Report tokens tagged ``PERSON`` as :class:`NameFilth`.
        :type enable_person: bool, default True
        :param enable_organization: Report tokens tagged ``ORGANIZATION`` as :class:`OrganizationFilth`.
        :type enable_organization: bool, default True
        :param enable_location: Report tokens tagged ``LOCATION`` as :class:`LocationFilth`.
        :type enable_location: bool, default False
        :param name: Overrides the default name of the :class:``Detector``
        :type name: str, optional
        :param locale: The locale of the documents in the format: 2 letter lower-case language code followed by an
                       underscore and the two letter upper-case country code, eg "en_GB" or "de_CH".
        :type locale: str, optional
        """
        # The tagger is created lazily on the first call to ``iter_filth``.
        self.stanford_tagger = None  # type: Optional[nltk.tag.StanfordNERTagger]

        # Maps the tag emitted by the tagger to the Filth class that should be yielded for it.
        self.filth_lookup = {}  # type: Dict[str, Type[Filth]]
        if enable_person:
            self.filth_lookup['PERSON'] = NameFilth
        if enable_organization:
            self.filth_lookup['ORGANIZATION'] = OrganizationFilth
        if enable_location:
            self.filth_lookup['LOCATION'] = LocationFilth

        # All paths below are templates: ``{version}`` is filled in with ``self.stanford_version`` when used.
        self.stanford_classifier = os.path.join('stanford-ner-{version}', 'classifiers',
                                                'english.all.3class.distsim.crf.ser.gz')

        self.stanford_prefix = str(pathlib.Path.home() / '.scrubadub' / 'stanford_ner')
        self.stanford_download_path = os.path.join(self.stanford_prefix, 'stanford-ner-{version}.zip')
        self.stanford_classifier_path = os.path.join(self.stanford_prefix, self.stanford_classifier)
        self.stanford_ner_jar_path = os.path.join(self.stanford_prefix, 'stanford-ner-{version}', 'stanford-ner.jar')

        # Archive members (relative to ``stanford_prefix``) that are extracted and must exist before tagging.
        self.stanford_files = [
            self.stanford_classifier,
            os.path.join('stanford-ner-{version}', 'stanford-ner.jar'),
            os.path.join('stanford-ner-{version}', 'stanford-ner-{version}.jar'),
            os.path.join('stanford-ner-{version}', 'stanford-ner-{version}-javadoc.jar'),
            os.path.join('stanford-ner-{version}', 'stanford-ner-{version}-sources.jar'),
        ]

        super(StanfordEntityDetector, self).__init__(**kwargs)

    def _check_downloaded(self):
        """Find out if the stanford NER tagger has already been downloaded.

        :return: ``True`` if every file listed in ``self.stanford_files`` exists below ``self.stanford_prefix``.
        :rtype: bool
        """
        return all(
            os.path.exists(os.path.join(self.stanford_prefix, file_name.format(version=self.stanford_version)))
            for file_name in self.stanford_files
        )

    def _download(self):
        """Download and extract the needed files from the Stanford NER tagger.

        The archive is streamed to a temporary ``.part`` file and only moved to its final name once the download
        has completed successfully, so an interrupted or failed download is retried on the next call instead of
        leaving a broken archive behind.

        :raises requests.RequestException: If the download fails or the server returns an error status.
        :raises zipfile.BadZipFile: If the archive on disk is not a valid zip file (the file is removed first).
        :raises RuntimeError: If the expected files are still missing after extraction.
        """
        # Make the data directory
        pathlib.Path(self.stanford_prefix).mkdir(parents=True, exist_ok=True)

        url = self.stanford_download_url.format(version=self.stanford_version)

        # Download the NER tagger
        download_path = self.stanford_download_path.format(version=self.stanford_version)
        if not os.path.exists(download_path):
            partial_path = download_path + '.part'
            try:
                response = requests.get(url, stream=True, timeout=self.stanford_download_timeout)
                try:
                    # Without this an HTTP error page would be saved as if it were the archive.
                    response.raise_for_status()
                    with open(partial_path, 'wb') as download_file:
                        # Stream in chunks so that the whole archive is never held in memory.
                        for chunk in response.iter_content(chunk_size=1024 * 1024):
                            download_file.write(chunk)
                finally:
                    response.close()
                os.replace(partial_path, download_path)
            finally:
                # Only still present if something above failed before the rename.
                if os.path.exists(partial_path):
                    os.remove(partial_path)

        # Extract the needed files
        try:
            with zipfile.ZipFile(download_path, 'r') as downloaded_zip_file:
                for file_to_extract in self.stanford_files:
                    downloaded_zip_file.extract(
                        member=file_to_extract.format(version=self.stanford_version),
                        path=self.stanford_prefix
                    )
        except zipfile.BadZipFile:
            # A corrupt archive would otherwise block every future attempt, as an existing file is never
            # downloaded again.
            os.remove(download_path)
            raise

        # Ensure it extracted the files that we need
        if not self._check_downloaded():
            raise RuntimeError(
                "Unable to download the Stanford NER tagger from {url}, perhaps try again?".format(url=url)
            )

    @staticmethod
    def _entity_pattern(entity):
        """Compile a regex that finds ``entity`` (space separated tokens) as a whole phrase in a document."""
        body = r'\s+'.join(re.escape(token) for token in entity.split(' '))
        # Lookarounds are used rather than ``\b`` because ``\b`` can never match next to an entity that starts or
        # ends with a non-word character (e.g. the trailing dot of "U.S." followed by a space).
        return re.compile(r'(?<!\w)' + body + r'(?!\w)', re.MULTILINE | re.UNICODE)

    def iter_filth(self, text, document_name: Optional[str] = None):
        """Yields discovered filth in the provided ``text``.

        The text is tokenised and tagged with the Stanford NER tagger, contiguous tokens sharing an enabled tag are
        joined into one entity, and every occurrence of each distinct entity in ``text`` is yielded.

        :param text: The dirty text to clean.
        :type text: str
        :param document_name: The name of the document to clean.
        :type document_name: str, optional
        :return: An iterator to the discovered :class:`Filth`
        :rtype: Iterator[:class:`Filth`]
        """
        if self.stanford_tagger is None:
            if not self._check_downloaded():
                self._download()

            self.stanford_tagger = ScrubadubStanfordNERTagger(
                self.stanford_classifier_path.format(version=self.stanford_version),
                self.stanford_ner_jar_path.format(version=self.stanford_version),
            )

        tokens = nltk.tokenize.word_tokenize(text)
        if not tokens:
            # Nothing to tag, so do not start the tagger at all.
            return
        tags = self.stanford_tagger.tag(tokens)

        # Normalise the ignore list once instead of once per token.
        ignored = {word.lower().strip() for word in self.ignored_words}

        grouped_tags = {}  # type: Dict[str, List[str]]
        previous_tag = None

        # Loop over all tagged words and join contiguous words that share the same enabled tag
        for tag_text, tag_type in tags:
            if tag_type in self.filth_lookup and tag_text.lower().strip() not in ignored:
                if previous_tag == tag_type:
                    grouped_tags[tag_type][-1] += ' ' + tag_text
                else:
                    grouped_tags.setdefault(tag_type, []).append(tag_text)
                previous_tag = tag_type
            else:
                previous_tag = None

        # Now look for each distinct entity in the original document
        for tag_type, entities in grouped_tags.items():
            found_filth_cls = self.filth_lookup[tag_type]
            # dict.fromkeys de-duplicates while keeping first-seen order, so results are yielded in a
            # reproducible order from run to run.
            for entity in dict.fromkeys(entities):
                for instance in self._entity_pattern(entity).finditer(text):
                    yield found_filth_cls(
                        beg=instance.start(),
                        end=instance.end(),
                        text=instance.group(),
                        detector_name=self.name,
                        document_name=document_name,
                        locale=self.locale,
                    )

    @classmethod
    def supported_locale(cls, locale: str) -> bool:
        """Returns true if this ``Detector`` supports the given locale.

        :param locale: The locale of the documents in the format: 2 letter lower-case language code followed by an
                       underscore and the two letter upper-case country code, eg "en_GB" or "de_CH".
        :type locale: str
        :return: ``True`` if the locale is supported, otherwise ``False``
        :rtype: bool
        """
        # Only the english model is downloaded, so only the language part of the locale matters.
        language, _ = cls.locale_split(locale)
        return language in ['en']


# Hand the detector class to scrubadub's detector catalogue at import time.
# NOTE(review): presumably this makes the detector available by its ``name`` ("stanford") - confirm against
# scrubadub.detectors.catalogue.
register_detector(StanfordEntityDetector)

# Public API of this module.
__all__ = ["StanfordEntityDetector"]