Source code for scrubadub.detectors.base

import re
import warnings
from typing import Optional, ClassVar, Type, Generator, Pattern, Dict, Sequence

from ..filth import Filth
from ..import utils


class Detector(object):
    """This is the base class for all detectors.

    A simple example of how to make a new detector is given below:

    .. code:: pycon

        >>> import scrubadub
        >>> class MyFilth(scrubadub.filth.Filth):
        ...     type = 'mine'
        >>> class MyDetector(scrubadub.detectors.Detector):
        ...     name = 'my_fr_detector'
        ...     def iter_filth(self, text, document_name=None):
        ...         # This detector always returns this same Filth no matter the input.
        ...         # You should implement something better here.
        ...         yield MyFilth(beg=0, end=8, text='My stuff', document_name=document_name, detector_name=self.name)
        >>> scrubber = scrubadub.Scrubber()
        >>> scrubber.add_detector(MyDetector)
        >>> text = "My stuff can be found there."
        >>> scrubber.clean(text)
        '{{MINE}} can be found there.'

    You can also advertise a ``Detector`` as supporting a certain locale by defining the
    ``Detector.supported_locale()`` function.
    """

    # Class of the ``Filth`` objects associated with this detector.
    filth_cls = Filth  # type: ClassVar[Type[Filth]]
    # Default name; the literal 'detector' is treated as "no explicit name declared" by ``__init__``.
    name = 'detector'  # type: str
    autoload = False  # type: bool

    def __init__(self, name: Optional[str] = None, locale: str = 'en_US'):
        """Initialise the ``Detector``.

        If the class still carries the default name ``'detector'``, is a subclass of ``Detector`` and
        its ``filth_cls`` defines a ``type``, the name is taken from ``filth_cls.type`` and a
        ``DeprecationWarning`` is emitted. An explicit ``name`` argument always takes precedence.

        If the instance exposes a ``supported_locale`` callable and it reports that ``locale`` is not
        supported, a warning is emitted; initialisation still completes.

        :param name: Overrides the default name of the :class:`Detector`
        :type name: str, optional
        :param locale: The locale of the documents in the format: 2 letter lower-case language code followed by an
                       underscore and the two letter upper-case country code, eg "en_GB" or "de_CH".
        :type locale: str, optional
        """
        if getattr(self, 'name', 'detector') == 'detector' and getattr(self, 'filth_cls', None) is not None:
            # Legacy fallback: only subclasses (never the bare base class) inherit a name from their filth type.
            if getattr(self.filth_cls, 'type', None) is not None and type(self) is not Detector:
                self.name = self.filth_cls.type
                warnings.warn(
                    "Setting the detector name from the filth_cls.type is deprecated, please declare an explicit "
                    "name attribute on the class.",
                    DeprecationWarning
                )

        if name is not None:
            self.name = name

        self.locale = locale
        # ``locale_split`` is provided by ``utils``; its result is unpacked as (language, region).
        self.language, self.region = self.locale_split(locale)

        # ``supported_locale`` is optional: the base class does not define it, subclasses may.
        if hasattr(self, 'supported_locale'):
            if not self.supported_locale(locale=locale):  # type: ignore
                warnings.warn("Detector {} does not support the locale '{}'.".format(self.name, locale))

    # Expose the locale helpers from ``utils`` on the class so that they can be called without an instance.
    locale_transform = staticmethod(utils.locale_transform)
    locale_split = staticmethod(utils.locale_split)

    def iter_filth(self, text: str, document_name: Optional[str] = None) -> Generator[Filth, None, None]:
        """Yields discovered filth in the provided ``text``.

        :param text: The dirty text to clean.
        :type text: str
        :param document_name: The name of the document to clean.
        :type document_name: str, optional
        :return: An iterator to the discovered :class:`Filth`
        :rtype: Iterator[:class:`Filth`]
        :raises NotImplementedError: Always, in this base class; derived classes must override this method.
        """
        raise NotImplementedError('must be implemented in derived classes')

    def iter_filth_documents(self, document_list: Sequence[str],
                             document_names: Sequence[Optional[str]]) -> Generator[Filth, None, None]:
        """Yields discovered filth in a list of documents.

        :param document_list: A list of documents to clean.
        :type document_list: List[str]
        :param document_names: A list containing the name of each document.
        :type document_names: List[str]
        :return: An iterator to the discovered :class:`Filth`
        :rtype: Iterator[:class:`Filth`]
        :raises NotImplementedError: Always, in this base class; derived classes must override this method.
        """
        raise NotImplementedError('must be implemented in derived classes')
class RegexDetector(Detector):
    """Base class to match PII with a regex.

    This class requires that the ``filth_cls`` attribute be set to the class of the ``Filth`` that should be
    returned by this ``Detector``.

    .. code:: pycon

        >>> import re, scrubadub
        >>> class NewUrlDetector(scrubadub.detectors.RegexDetector):
        ...     name = 'new_url_detector'
        ...     filth_cls = scrubadub.filth.url.UrlFilth
        ...     regex = re.compile(r'https.*$', re.IGNORECASE)
        >>> scrubber = scrubadub.Scrubber(detector_list=[NewUrlDetector()])
        >>> text = u"This url will be found https://example.com"
        >>> scrubber.clean(text)
        'This url will be found {{URL}}'
    """

    # Compiled pattern used to find filth; when left as ``None`` the pattern on ``filth_cls`` is tried instead.
    regex = None  # type: Optional[Pattern[str]]
    filth_cls = Filth  # type: ClassVar[Type[Filth]]

    def iter_filth(self, text: str, document_name: Optional[str] = None) -> Generator[Filth, None, None]:
        """Yields one ``filth_cls`` instance for every match of the regex in ``text``.

        Because this is a generator, the validation below only runs once iteration starts.

        :param text: The dirty text to clean.
        :type text: str
        :param document_name: The name of the document to clean.
        :type document_name: str, optional
        :return: An iterator to the discovered :class:`Filth`
        :rtype: Iterator[:class:`Filth`]
        :raises TypeError: If ``filth_cls`` is not a subclass of ``Filth``.
        :raises ValueError: If neither the detector nor its ``filth_cls`` provides a regex.
        """
        filth_type = self.filth_cls
        if not issubclass(filth_type, Filth):
            raise TypeError(
                'filth_cls attribute of {} needs to be set to a subclass of the Filth class.'.format(self.__class__)
            )

        pattern = self.regex
        if pattern is None:
            # Legacy path: the regex may still live on the filth class rather than on the detector.
            warnings.warn('regex should be defined in the Detector and not in the Filth class', DeprecationWarning)
            pattern = filth_type.regex
            if pattern is None:
                raise ValueError('No regular expression has been specified for {}.'.format(self.__class__))
            # Remember the borrowed pattern on the instance so later calls skip the fallback.
            self.regex = pattern

        for found in pattern.finditer(text):
            yield self.filth_cls(
                match=found,
                detector_name=self.name,
                document_name=document_name,
                locale=self.locale,
            )
class RegionLocalisedRegexDetector(RegexDetector):
    """Detector to detect ``Filth`` localised using regular expressions localised by the region"""

    # Maps a region code (the second element returned by ``locale_split``) to the pattern used for that region.
    region_regex = {}  # type: Dict[str, Pattern]

    def __init__(self, **kwargs):
        """Initialise the ``Detector`` and select the regex registered for its region.

        All keyword arguments are forwarded unchanged to the parent initialiser.

        :param name: Overrides the default name of the :class:`Detector`
        :type name: str, optional
        :param locale: The locale of the documents in the format: 2 letter lower-case language code followed by an
                       underscore and the two letter upper-case country code, eg "en_GB" or "de_CH".
        :type locale: str, optional
        """
        super().__init__(**kwargs)

        # A pattern that can never match: the negative lookahead forbids an "a" at the very position where an
        # "a" is then required. It is the fallback for regions without an entry in ``region_regex``, so that the
        # detector yields no filth rather than applying another region's pattern.
        never_matches = re.compile(r'(?!a)a')
        self.regex = self.region_regex.get(self.region, never_matches)

    @classmethod
    def supported_locale(cls, locale: str) -> bool:
        """Returns true if this ``Detector`` supports the given locale.

        A locale is considered supported when its region has an entry in ``region_regex``.

        :param locale: The locale of the documents in the format: 2 letter lower-case language code followed by an
                       underscore and the two letter upper-case country code, eg "en_GB" or "de_CH".
        :type locale: str
        :return: ``True`` if the locale is supported, otherwise ``False``
        :rtype: bool
        """
        _, region_code = cls.locale_split(locale)
        return region_code in cls.region_regex