Source code for qfa.adapters.presidio_anonymizer
"""Presidio-based anonymisation adapter.
Implements ``AnonymizationPort`` by delegating to Microsoft Presidio's
analyzer and anonymizer engines. Owns the heavy spaCy-backed pipelines
so the application service layer never imports Presidio directly.
"""
import re

from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine, OperatorConfig

from qfa.domain.ports import AnonymizationPort
[docs]
class PresidioAnonymizer(AnonymizationPort):
    """``AnonymizationPort`` implementation backed by Presidio.

    Replaces detected entities with stable placeholders of the form
    ``<ENTITY_TYPE_N>`` (e.g. ``<PERSON_0>``, ``<LOCATION_1>``) so the
    same value gets the same placeholder within a single ``anonymize``
    call. ``N`` is the number of placeholders already issued in that
    call, so indices are unique across all entity types rather than
    counted per type. ``DATE_TIME`` entities are preserved verbatim;
    they carry relevant context for analysis without identifying
    individuals.
    """

    def __init__(self) -> None:
        """Create the Presidio analyzer and anonymizer engines with defaults."""
        self._analyzer: AnalyzerEngine = AnalyzerEngine()
        self._anonymizer: AnonymizerEngine = AnonymizerEngine()

    def anonymize(self, text: str) -> tuple[str, dict[str, str]]:
        """Replace sensitive entities in ``text`` with placeholders.

        Args:
            text: The text to analyse (always analysed as English).

        Returns:
            A ``(anonymized_text, mapping)`` tuple where ``mapping`` maps
            each issued placeholder to the original value it replaced.
            Pass it to ``deanonymize`` to restore the originals.
        """
        mapping: dict[str, str] = {}
        results = self._analyzer.analyze(text=text, language="en")
        unique_entities = {res.entity_type for res in results}

        operators: dict[str, OperatorConfig] = {}
        for entity in unique_entities:
            operators[entity] = OperatorConfig(
                "custom",
                {
                    # Bind 'entity' as the default of 'ent' so each lambda
                    # keeps its own entity type (avoids late-binding closures).
                    "lambda": lambda x, ent=entity: self._get_unique_id(
                        x, ent, mapping
                    )
                },
            )

        # Preserve DATE_TIME entities without anonymisation. Assigned after
        # the loop so it overrides any custom operator created for DATE_TIME.
        operators["DATE_TIME"] = OperatorConfig("keep")

        anonymized = self._anonymizer.anonymize(
            text=text,
            analyzer_results=results,  # type: ignore[ty:invalid-argument-type]
            operators=operators,
        )
        return anonymized.text, mapping

    def deanonymize(self, text: str, mapping: dict[str, str]) -> str:
        """Restore original values in ``text`` using ``mapping``.

        All placeholders are substituted in a single pass over ``text``,
        so a restored value that happens to contain another placeholder
        token is never substituted a second time.

        Args:
            text: Text containing placeholders issued by ``anonymize``.
            mapping: Placeholder -> original value, as returned by
                ``anonymize``.

        Returns:
            ``text`` with every known placeholder replaced by its original
            value. Text is returned unchanged when ``mapping`` is empty.
        """
        # Empty keys would match at every position; ignore them. Longest
        # first so a placeholder is never shadowed by a shorter one that
        # is a prefix of it.
        placeholders = sorted((key for key in mapping if key), key=len, reverse=True)
        if not placeholders:
            return text

        pattern = re.compile("|".join(re.escape(key) for key in placeholders))
        # A function replacement keeps backslashes in original values literal.
        return pattern.sub(lambda match: mapping[match.group(0)], text)

    @staticmethod
    def _get_unique_id(
        original_value: str, entity_type: str, mapping: dict[str, str]
    ) -> str:
        """Return a stable placeholder for ``original_value`` within ``mapping``.

        Reuses the placeholder already issued for the same value and
        entity type; otherwise issues ``<ENTITY_TYPE_N>`` (``N`` being the
        current size of ``mapping``) and records it in ``mapping``.

        Args:
            original_value: The detected text to be replaced.
            entity_type: The entity type the text was detected as.
            mapping: Placeholder -> original value; mutated in place when
                a new placeholder is issued.

        Returns:
            The placeholder to substitute for ``original_value``.
        """
        # NOTE(review): presumably guards against Presidio probing the custom
        # operator's callable with the literal "PII" during validation, which
        # would otherwise add a bogus entry to the mapping -- confirm against
        # the Presidio custom-operator implementation. Side effect: a real
        # entity whose text is exactly "PII" is not recorded in the mapping.
        if original_value == "PII":
            return "<PII>"

        prefix = f"<{entity_type}_"
        for placeholder, value in mapping.items():
            if value != original_value or not placeholder.startswith(prefix):
                continue
            # Require "<TYPE_<digits>>" exactly, so that e.g. type "US" does
            # not reuse a placeholder issued for type "US_SSN".
            suffix = placeholder[len(prefix):]
            if suffix.endswith(">") and suffix[:-1].isdigit():
                return placeholder

        placeholder = f"<{entity_type}_{len(mapping)}>"
        mapping[placeholder] = original_value
        return placeholder