Source code for scrubadub


from typing import Union, List, Dict, Sequence, Optional

# convenient imports
from .scrubbers import Scrubber
from . import filth
from . import detectors
from . import post_processors
from .filth import Filth

__version__ = VERSION = "2.0.0"
__all__ = [
    'Scrubber', 'filth', 'detectors', 'post_processors', 'clean', 'clean_documents', 'list_filth',
    'list_filth_documents',
]


[docs]def clean(text: str, locale: Optional[str] = None, **kwargs) -> str: """Seaches for ``Filth`` in `text` in a string and replaces it with placeholders. .. code:: pycon >>> import scrubadub >>> scrubadub.clean(u"contact me at joe@example.com") 'contact me at {{EMAIL}}' :param text: The text containing possible PII that needs to be redacted :type text: `str` :param locale: The locale of the documents in the format: 2 letter lower-case language code followed by an underscore and the two letter upper-case country code, eg "en_GB" or "de_CH" :type locale: str :return: Text with all :class:``Filth`` replaced. :rtype: `str` """ scrubber = Scrubber(locale=locale) return scrubber.clean(text, **kwargs)
[docs]def clean_documents(documents: Union[Sequence[str], Dict[Optional[str], str]], locale: Optional[str] = None, **kwargs ) -> Union[Sequence[str], Dict[Optional[str], str]]: """Seaches for ``Filth`` in `documents` and replaces it with placeholders. `documents` can be in a dict, in the format of ``{'document_name': 'document'}``, or as a list of strings (each a seperate document). This can be useful when processing many documents. .. code:: pycon >>> import scrubadub >>> scrubadub.clean_documents({'contact.txt': "contact me at joe@example.com", ... 'hello.txt': 'hello world!'}) {'contact.txt': 'contact me at {{EMAIL}}', 'hello.txt': 'hello world!'} >>> scrubadub.clean_documents(["contact me at joe@example.com", 'hello world!']) ['contact me at {{EMAIL}}', 'hello world!'] :param documents: Documents containing possible PII that needs to be redacted in the form of a list of documents or a dictonary with the key as the document name and the value as the document text :type documents: `list` of `str` objects, `dict` of `str` objects :param locale: The locale of the documents in the format: 2 letter lower-case language code followed by an underscore and the two letter upper-case country code, eg "en_GB" or "de_CH" :type locale: str :return: Documents in the same format as input, but with `Filth` redacted :rtype: `list` of `str` objects, `dict` of `str` objects; same as input """ scrubber = Scrubber(locale=locale) return scrubber.clean_documents(documents, **kwargs)
[docs]def list_filth(text: str, locale: Optional[str] = None, **kwargs) -> List[Filth]: """Return a list of ``Filth`` that was detected in the string `text`. .. code:: pycon >>> import scrubadub >>> scrubadub.list_filth(u"contact me at joe@example.com") [<EmailFilth text='joe@example.com' beg=14 end=29 detector_name='email' locale='en_US'>] :param text: The text containing possible PII that needs to be found :type text: `str` :param locale: The locale of the documents in the format: 2 letter lower-case language code followed by an underscore and the two letter upper-case country code, eg "en_GB" or "de_CH" :type locale: str :return: A list of all the :class:``Filth`` objects that were found :rtype: `list` of :class:``Filth`` objects """ scrubber = Scrubber(locale=locale) return list(scrubber.iter_filth(text, **kwargs))
[docs]def list_filth_documents(documents: Union[List[str], Dict[Optional[str], str]], locale: Optional[str] = None, **kwargs) -> List[Filth]: """Return a list of ``Filth`` that was detected in the string `text`. `documents` can be in a dict, in the format of ``{'document_name': 'document'}``, or as a list of strings (each a seperate document). This can be useful when processing many documents. .. code:: pycon >>> import scrubadub >>> scrubadub.list_filth_documents( ... {'contact.txt': "contact me at joe@example.com", 'hello.txt': 'hello world!'} ... ) [<EmailFilth text='joe@example.com' document_name='contact.txt' beg=14 end=29 detector_name='email' \ locale='en_US'>] >>> scrubadub.list_filth_documents(["contact me at joe@example.com", 'hello world!']) [<EmailFilth text='joe@example.com' document_name='0' beg=14 end=29 detector_name='email' locale='en_US'>] :param documents: Documents containing possible PII that needs to be found in the form of a list of documents or a dictonary with the key as the document name and the value as the document text :type documents: `list` of `str` objects, `dict` of `str` objects :param locale: The locale of the documents in the format: 2 letter lower-case language code followed by an underscore and the two letter upper-case country code, eg "en_GB" or "de_CH" :type locale: str :return: A list of all the :class:``Filth`` objects that were found :rtype: `list` of :class:``Filth`` objects """ scrubber = Scrubber(locale=locale) return list(scrubber.iter_filth_documents(documents, **kwargs))