Source code for scrubadub.detectors.user_supplied

from typing import Optional

from scrubadub.detectors.catalogue import register_detector
from .. import filth as filth_module
from ..filth.base import Filth
from .tagged import TaggedEvaluationFilthDetector


@register_detector
class UserSuppliedFilthDetector(TaggedEvaluationFilthDetector):
    """Use this ``Detector`` to find some known filth in the text.

    An example might be if you have a list of employee numbers that you wish
    to remove from a document, as shown below:

    >>> import scrubadub
    >>> scrubber = scrubadub.Scrubber(detector_list=[
    ...     scrubadub.detectors.UserSuppliedFilthDetector([
    ...         {'match': 'Anika', 'filth_type': 'name'},
    ...         {'match': 'Larry', 'filth_type': 'name'},
    ...     ]),
    ... ])
    >>> scrubber.clean("Anika is my favourite employee.")
    '{{NAME}} is my favourite employee.'

    This detector takes a list of dictionaries (referred to as known filth
    items). These specify what to look for in the text to label as tagged
    filth. The dictionary should contain the following keys:

    * ``match`` (`str`) - a string value that will be searched for in the
      text
    * ``filth_type`` (`str`) - a string value that indicates the type of
      Filth, should be set to ``Filth.type``. An example of these could be
      'name' or 'phone' for name and phone filths respectively.

    The known filth item dictionary may also optionally contain:

    * ``match_end`` (`str`) - if specified will search for Filth starting
      with the value of match and ending with the value of ``match_end``
    * ``limit`` (`int`) - an integer describing the maximum number of
      characters between match and match_end, defaults to 150
    * ``ignore_case`` (`bool`) - Ignore case when searching for the tagged
      filth
    * ``ignore_whitespace`` (`bool`) - Ignore whitespace when matching
      ("asd qwe" can also match "asd\\\\nqwe")
    * ``ignore_partial_word_matches`` (`bool`) - Ignore matches that are
      only partial words (if you're looking for "Eve", this flag ensures it
      won't match "Evening")

    Examples of this:

    * ``{'match': 'aaa', 'filth_type': 'name'}`` - will search for an exact
      match to aaa and return it as a ``NameFilth``
    * ``{'match': 'aaa', 'match_end': 'zzz', 'filth_type': 'name'}`` - will
      search for `aaa` followed by up to 150 characters followed by `zzz`,
      which would match both `aaabbbzzz` and `aaazzz`.
    * ``{'match': '012345', 'filth_type': 'phone',
      'ignore_partial_word_matches': True}`` - will search for an exact
      match to 012345, ignoring any partial matches and return it as a
      ``PhoneFilth``

    This detector is not enabled by default (since you need to supply a list
    of known filths) and so you must always add it to your scrubber with a
    ``scrubber.add_detector(detector)`` call or by adding it to the
    ``detector_list`` when initialising a ``Scrubber``.
    """

    name = 'user_supplied'

    def create_filth(
            self, start_location: int, end_location: int, text: str,
            comparison_type: Optional[str], detector_name: str,
            document_name: Optional[str], locale: str
    ) -> Filth:
        """Build a ``Filth`` object of the class whose ``type`` matches.

        Every public and private name exposed by the ``filth`` package is
        inspected; the first ``Filth`` subclass (or ``Filth`` itself) whose
        ``type`` attribute equals ``comparison_type`` is instantiated.

        :param start_location: Passed as the first positional argument to the
            selected filth class.
        :param end_location: Passed as the second positional argument to the
            selected filth class.
        :param text: Passed as the third positional argument to the selected
            filth class.
        :param comparison_type: The value compared against each candidate
            class's ``type`` attribute to select the class.
        :param detector_name: Forwarded to the filth class as
            ``detector_name``.
        :param document_name: Forwarded to the filth class as
            ``document_name``.
        :param locale: Forwarded to the filth class as ``locale``.
        :return: An instance of the matching filth class.
        :raises KeyError: If no filth class has a ``type`` equal to
            ``comparison_type``.
        """
        for item_name in dir(filth_module):
            # ``dir`` can list names that cannot actually be resolved; the
            # ``None`` default is rejected by the ``isinstance`` check below,
            # so such names are skipped.
            filth_cls = getattr(filth_module, item_name, None)

            if not isinstance(filth_cls, type) or not issubclass(filth_cls, Filth):
                continue

            try:
                filth_type = filth_cls.type
            except AttributeError:
                continue

            if filth_type != comparison_type:
                continue

            return filth_cls(
                start_location,
                end_location,
                text,
                detector_name=detector_name,
                document_name=document_name,
                locale=locale,
            )

        raise KeyError(f"Unable to find filth '{comparison_type}'")