Source code for scrubadub.filth.base

import warnings
from faker import Faker
from typing import Optional, ClassVar, Pattern, List, Match

from .. import exceptions
from .. import utils

class Filth(object):
    """This is the base class for all ``Filth`` that is detected in dirty dirty
    text.

    An instance records the span of the detected text (``beg``/``end``), the
    text itself, and optional metadata: the ``re`` match it came from, the
    detector and document names, a replacement string and a locale.
    """

    # this allows people to customize the output, especially for placeholder
    # text and identifier replacements
    prefix = u'{{'  # type: ClassVar[str]
    suffix = u'}}'  # type: ClassVar[str]

    # the `type` is used when filths are merged to come up with a sane label
    type = 'unknown'  # type: ClassVar[str]

    # the `lookup` is used to keep track of all of the different types of filth
    # that are encountered across all `Filth` types.
    lookup = utils.Lookup()

    # For backwards compatibility, but this is deprecated.
    regex = None  # type: Optional[Pattern[str]]

    def __init__(self, beg: Optional[int] = None, end: Optional[int] = None, text: Optional[str] = None,
                 match: Optional[Match] = None, detector_name: Optional[str] = None,
                 document_name: Optional[str] = None, replacement_string: Optional[str] = None,
                 locale: Optional[str] = None, **kwargs):
        """Create a piece of filth from explicit bounds and/or a regex match.

        :param beg: Start offset of the filth; overrides the start of ``match``
        :param end: End offset of the filth; overrides the end of ``match``
        :param text: The filthy text; overrides the text taken from ``match``
        :param match: A regex match from which ``beg``, ``end`` and ``text``
            are initialised when it is given
        :param detector_name: Stored as-is on the instance
        :param document_name: Stored as-is on the instance
        :param replacement_string: Stored as-is on the instance
        :param locale: Stored as-is on the instance
        :param kwargs: Accepted but not used by this constructor
        :raises ValueError: If the resulting ``beg`` is not strictly smaller
            than the resulting ``end``
        """
        self.beg = 0  # type: int
        self.end = 0  # type: int
        self.text = ''  # type: str
        self.match = None  # type: Optional[Match]

        # A match provides the defaults for the span and the text ...
        if match is not None and isinstance(match, Match):
            self.beg = match.start()
            self.end = match.end()
            self.text = match.string[match.start():match.end()]
            self.match = match

        # ... and explicitly passed values take precedence over the match.
        if beg is not None:
            self.beg = beg
        if end is not None:
            self.end = end
        if text is not None:
            self.text = text

        self.detector_name = detector_name  # type: Optional[str]
        self.document_name = document_name  # type: Optional[str]
        self.replacement_string = replacement_string  # type: Optional[str]
        self.locale = locale  # type: Optional[str]

        # Empty or reversed spans are rejected.
        if self.beg >= self.end:
            raise ValueError(
                f"Creating invalid filth (self.beg >= self.end): {self}"
            )

    @property
    def placeholder(self) -> str:
        """The upper-cased ``type`` of this filth."""
        return self.type.upper()

    @property
    def identifier(self) -> str:
        """The placeholder followed by ``-`` and the number that ``lookup``
        returns for this filth's ``(type, lower-cased text)`` pair.
        """
        # NOTE: this is not an efficient way to store this in memory. could
        # alternatively hash the type and text and do away with the overhead
        # bits of storing the tuple in the lookup
        i = self.lookup[(self.type, self.text.lower())]
        return u'%s-%d' % (self.placeholder, i)

    def replace_with(self, replace_with: str = 'placeholder', **kwargs) -> str:
        """Return the replacement text for this filth (deprecated).

        :param replace_with: ``'placeholder'`` to wrap :attr:`placeholder` in
            the prefix and suffix, or ``'identifier'`` to wrap
            :attr:`identifier` in them
        :param kwargs: Accepted but not used
        :return: The prefix, the chosen label and the suffix joined together
        :raises exceptions.InvalidReplaceWith: For any other ``replace_with``
        """
        # stacklevel=2 attributes the warnings to the calling code rather than
        # to this method, so users can find the call that needs migrating.
        warnings.warn(
            "Filth.replace_with() will be removed in favour of using the more general PostProcessors",
            DeprecationWarning,
            stacklevel=2,
        )
        if self.prefix != '{{' or self.suffix != '}}':
            warnings.warn(
                "Setting prefixes and suffixes with scrubadub.filth.Filth.prefix or scrubadub.filth.Filth.suffix "
                "is depreciated in favour of using the PrefixSuffixReplacer",
                DeprecationWarning,
                stacklevel=2,
            )

        if replace_with == 'placeholder':
            return self.prefix + self.placeholder + self.suffix
        # elif replace_with == 'surrogate':
        #     raise NotImplementedError
        elif replace_with == 'identifier':
            return self.prefix + self.identifier + self.suffix
        else:
            raise exceptions.InvalidReplaceWith(replace_with)

    def merge(self, other_filth: 'Filth') -> 'MergedFilth':
        """Return a new ``MergedFilth`` built from this filth and ``other_filth``."""
        return MergedFilth(self, other_filth)

    def __repr__(self) -> str:
        return self._to_string()

    def _to_string(self, attributes: Optional[List[str]] = None) -> str:
        """Format the instance as ``<ClassName name=value ...>``.

        :param attributes: Names of the attributes to show; a default list is
            used when ``None``
        :return: The class name followed by ``name=repr(value)`` for every
            listed attribute that exists and is not ``None``
        """
        if attributes is None:
            attributes = ['text', 'document_name', 'beg', 'end', 'comparison_type', 'detector_name', 'locale']

        item_attributes = []
        for item in attributes:
            # Read each attribute once; missing ones are treated like ``None``
            # and left out of the output.
            value = getattr(self, item, None)
            if value is not None:
                item_attributes.append("{}={}".format(item, repr(value)))

        return "<{} {}>".format(self.__class__.__name__, " ".join(item_attributes))

    def __eq__(self, other) -> bool:
        """Only test equality on a subset of class attributes and some are optional

        ``beg``, ``end`` and ``text`` must be present on ``other`` and equal.
        ``document_name`` and ``detector_name`` are optional: an object that
        lacks one of them is compared as if it were ``None``.

        :raises TypeError: If ``other`` lacks ``beg``, ``end`` or ``text``
        """
        # NOTE: defining __eq__ without __hash__ makes instances unhashable.
        match = True

        if not hasattr(other, 'beg') or not hasattr(other, 'end') or not hasattr(other, 'text'):
            raise TypeError("Unsupported comparison with a Filth and {}".format(type(other)))

        match &= (self.beg == other.beg)
        match &= (self.end == other.end)
        match &= (self.text == other.text)

        # Use getattr so that a duck-typed ``other`` without these optional
        # attributes yields a comparison result instead of an AttributeError.
        for optional in ('document_name', 'detector_name'):
            match &= (getattr(self, optional, None) == getattr(other, optional, None))

        return match

    @staticmethod
    def generate(faker: Faker) -> str:
        """Generates an example of this ``Filth`` type, usually using the faker python library.

        :param faker: The ``Faker`` class from the ``faker`` library
        :type faker: Faker
        :return: An example of this ``Filth``
        :rtype: str
        """
        raise NotImplementedError("A generate() function has not been implemented for this Filth")

    def is_valid(self) -> bool:
        """Return ``True``; the base class performs no validation."""
        return True
class MergedFilth(Filth):
    """This class takes care of merging different types of filth"""

    def __init__(self, a_filth: Filth, b_filth: Filth):
        """Start from the span, text and document of ``a_filth`` and merge
        ``b_filth`` into it.

        :raises exceptions.FilthMergeError: If the two filths cannot be merged
        """
        super(MergedFilth, self).__init__(
            beg=a_filth.beg,
            end=a_filth.end,
            text=a_filth.text,
            document_name=a_filth.document_name,
        )
        self.filths = [a_filth]
        self._update_content(b_filth)

    def _update_content(self, other_filth: Filth):
        """this updates the bounds, text and placeholder for the merged filth

        :raises exceptions.FilthMergeError: If there is a gap between the two
            spans, if they belong to different documents, or if the merged
            text length does not equal the merged span length
        """
        if self.end < other_filth.beg or other_filth.end < self.beg:
            raise exceptions.FilthMergeError(
                "a_filth goes from [%s, %s) and b_filth goes from [%s, %s)" % (
                    self.beg, self.end, other_filth.beg, other_filth.end
                ))

        if self.document_name != other_filth.document_name:
            raise exceptions.FilthMergeError(
                "This MergedFilth is in document {}, but the Filth that is being merged is in another document {}"
                "".format(repr(self.document_name), repr(other_filth.document_name))
            )

        # get the text over lap correct
        if self.beg < other_filth.beg:
            first = self  # type: Filth
            second = other_filth  # type: Filth
        else:
            second = self
            first = other_filth

        end_offset = second.end - first.end
        if end_offset > 0:
            # `second` sticks out past `first`: append only the tail of
            # `second` that `first` does not already cover.
            merged_text = first.text + second.text[-end_offset:]
        else:
            # `second` lies entirely inside `first`, so the merged text is the
            # text of `first`. This matters when `first` is `other_filth`;
            # keeping the old, shorter `self.text` would fail the length check.
            merged_text = first.text

        # update the beg/end strings
        merged_beg = min(self.beg, other_filth.beg)
        merged_end = max(self.end, other_filth.end)
        if merged_end - merged_beg != len(merged_text):
            raise exceptions.FilthMergeError("text length isn't consistent")

        # Assign only after validation so that a failed merge leaves this
        # object unchanged.
        self.text = merged_text
        self.beg = merged_beg
        self.end = merged_end

        # update the placeholder
        self.filths.append(other_filth)
        self._placeholder = '+'.join([filth.type for filth in self.filths])

    @property
    def placeholder(self):
        """The types of all merged filths joined with ``+``, upper-cased."""
        return self._placeholder.upper()

    def merge(self, other_filth: Filth) -> 'MergedFilth':
        """Be smart about merging filth in this case to avoid nesting merged
        filths.

        ``other_filth`` is folded into this instance, which is then returned.
        """
        self._update_content(other_filth)
        return self

    def __repr__(self) -> str:
        return self._to_string(['filths'])


class RegexFilth(Filth):
    """Deprecated alias of ``Filth`` that warns when it is instantiated."""

    def __init__(self, *args, **kwargs):
        # stacklevel=2 points the warning at the code creating the RegexFilth.
        warnings.warn(
            "Use of RegexFilth is depreciated, use Filth directly instead.",
            DeprecationWarning,
            stacklevel=2,
        )
        super(RegexFilth, self).__init__(*args, **kwargs)