Using RegEx to redact PDFs on Android

Nutrient Android SDK enables you to redact text in a PDF document using regular expression patterns. This guide shows how to do this using the existing text extraction and redaction APIs.

In this example, you’re redacting URLs from a document. First, you need to create the regular expression with the URL pattern:

// Kotlin
// Raw string literal, so the regex backslashes need no additional escaping.
// Shape: host-like characters, a dot, a 1–6 character suffix, then an optional
// captured run of path/query characters.
val urlPattern = """[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_+.~#?&/=]*)"""
val urlRegularExpression = urlPattern.toRegex()
// Java
// The pattern is assembled from three compile-time string constants; the
// resulting string is the same URL pattern in a single piece.
final String urlPattern =
    // 1–256 host-like characters.
    "[-a-zA-Z0-9@:%._+~#=]{1,256}"
    // A literal dot, a 1–6 character suffix, and a word boundary.
    + "\\.[a-zA-Z0-9()]{1,6}\\b"
    // Optional trailing path/query characters (capture group 1).
    + "([-a-zA-Z0-9()@:%_+.~#?&/=]*)";
// Uses Kotlin's Regex class from Java.
final Regex urlRegularExpression = new Regex(urlPattern);

Then, you’ll have to loop through all the pages in the document to find and mark the matching URLs for redaction:

// Kotlin
val fileUri = ... // File Uri
val document: PdfDocument = PdfDocumentLoader.openDocument(context, fileUri)
val pageCount = document.pageCount

for (pageIndex in 0 until pageCount) {
    val text = document.getPageText(pageIndex)

    // `findAll` yields every URL that occurs in the page text. `matchEntire`
    // would only succeed if the whole page text were one single URL.
    for (match in urlRegularExpression.findAll(text)) {
        // Use the range of the whole match rather than the individual capture
        // groups, so each URL is marked exactly once.
        val range = match.range
        val startIndex = range.first
        // `range.last` is inclusive, so the match length is last - first + 1.
        val length = range.last - range.first + 1

        val textRects = document.getPageTextRects(pageIndex, startIndex, length)
        val redaction = RedactionAnnotation(pageIndex, textRects).apply {
            color = Color.BLACK
            fillColor = Color.BLACK
            outlineColor = Color.YELLOW
            overlayText = "REDACTED"
        }
        document.annotationProvider.addAnnotationToPage(redaction)
    }
}
// Java
final Uri fileUri = ... // File Uri
final PdfDocument document = PdfDocumentLoader.openDocument(context, fileUri);
final int pageCount = document.getPageCount();

for (int pageIndex = 0; pageIndex < pageCount; pageIndex++) {
    final String text = document.getPageText(pageIndex);

    // Walk every URL that occurs in the page text. `matchEntire` would only
    // succeed if the whole page text were one single URL. The start index is
    // passed explicitly because Kotlin default arguments are not visible from Java.
    for (MatchResult match = urlRegularExpression.find(text, 0);
            match != null;
            match = match.next()) {
        // Use the range of the whole match rather than the individual capture
        // groups: each URL is marked exactly once, and unmatched (null) groups
        // are never dereferenced.
        final IntRange range = match.getRange();

        final int startIndex = range.getStart();
        // The end of the range is inclusive, so the match length is end - start + 1.
        final int length = range.getEndInclusive() - startIndex + 1;

        final List<RectF> textRects = document.getPageTextRects(pageIndex, startIndex, length);
        final RedactionAnnotation redaction = new RedactionAnnotation(pageIndex, textRects);
        redaction.setColor(Color.BLACK);
        redaction.setFillColor(Color.BLACK);
        redaction.setOutlineColor(Color.YELLOW);
        redaction.setOverlayText("REDACTED");

        document.getAnnotationProvider().addAnnotationToPage(redaction);
    }
}

Finally, create a new PDF file for the redacted document by applying redactions.

You can do so using the PdfProcessor API:

// Kotlin
// Placeholders: the destination file and the document that carries the
// redaction annotations.
val outputFile: File
val document: PdfDocument

// Build a processor task that applies the redactions, then write the result
// to the output file.
val redactionTask = PdfProcessorTask.fromDocument(document).applyRedactions()
PdfProcessor.processDocument(redactionTask, outputFile)

// Open the newly written file as the redacted document.
val outputUri = Uri.fromFile(outputFile)
val redactedDocument = PdfDocumentLoader.openDocument(context, outputUri)
// Java
// Placeholders: the destination file and the document that carries the
// redaction annotations.
final File outputFile;
final PdfDocument document;

// Build a processor task that applies the redactions, then write the result
// to the output file.
final PdfProcessorTask redactionTask = PdfProcessorTask.fromDocument(document).applyRedactions();
PdfProcessor.processDocument(redactionTask, outputFile);

// Open the newly written file as the redacted document.
final Uri outputUri = Uri.fromFile(outputFile);
final PdfDocument redactedDocument = PdfDocumentLoader.openDocument(context, outputUri);

Or, you can use the DocumentSaveOptions#setApplyRedactions() option when saving the document via any of the save methods on PdfDocument.

Bear in mind that this will overwrite the existing document, removing content irreversibly:

// Kotlin
// Configure the save options so that redactions are applied on save.
val documentSaveOptions = DocumentSaveOptions(null, null, true, null).apply {
    setApplyRedactions(true)
}
document.saveIfModified(documentSaveOptions)
// Java
// Configure the save options so that redactions are applied on save. The
// constructor arguments mirror the Kotlin sample so both languages save the
// document the same way.
final DocumentSaveOptions documentSaveOptions = new DocumentSaveOptions(null, null, true, null);
documentSaveOptions.setApplyRedactions(true);
document.saveIfModified(documentSaveOptions);