Using RegEx to redact PDFs on Android
Nutrient Android SDK enables you to redact text in a PDF document using regular expression patterns. This guide shows how to do this using the existing text extraction and redaction APIs.
In this example, you’re redacting URLs from a document. First, you need to create the regular expression with the URL pattern:
// Source of the URL-matching expression: a host-like run of characters, a literal dot,
// a short (1-6 character) suffix ending on a word boundary, then an optional
// path/query tail captured as group 1.
val urlPattern = """[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_+.~#?&/=]*)"""

// Compile the pattern once so the same Regex instance can be reused for every page.
val urlRegularExpression = urlPattern.toRegex()
// URL-matching pattern. Backslashes are doubled because this is an ordinary Java string
// literal; the optional path/query tail is captured as group 1.
final String urlPattern = "[-a-zA-Z0-9@:%._+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b([-a-zA-Z0-9()@:%_+.~#?&/=]*)";
// NOTE(review): `Regex` is presumably kotlin.text.Regex used from Java — confirm the import.
final Regex urlRegularExpression = new Regex(urlPattern);
Then, you’ll have to loop through all the pages in the document to find and mark the matching URLs for redaction:
val fileUri = ... // File Uri
val document: PdfDocument = PdfDocumentLoader.openDocument(context, fileUri)
val pageCount = document.pageCount

// Walk every page, find each URL in its text, and mark it with a redaction annotation.
for (pageIndex in 0 until pageCount) {
    val text = document.getPageText(pageIndex)

    // `findAll` yields every occurrence on the page. `matchEntire` would only succeed
    // when the whole page text is a single URL, so ordinary pages would be skipped.
    for (match in urlRegularExpression.findAll(text)) {
        // Use the range of the whole match only; iterating the capture groups would
        // add a second, overlapping redaction for the path/query group.
        val startIndex = match.range.first
        // The range is inclusive, so the character count is last - first + 1.
        val length = match.range.last - match.range.first + 1

        val textRects = document.getPageTextRects(pageIndex, startIndex, length)
        val redaction = RedactionAnnotation(pageIndex, textRects).apply {
            color = Color.BLACK
            fillColor = Color.BLACK
            outlineColor = Color.YELLOW
            overlayText = "REDACTED"
        }
        document.annotationProvider.addAnnotationToPage(redaction)
    }
}
final Uri fileUri = ... // File Uri final PdfDocument document = PdfDocumentLoader.openDocument(context, fileUri); final int pageCount = document.getPageCount(); for (int i = 0; i < pageCount; i++) { final String text = document.getPageText(i); final MatchResult matchResult = urlRegularExpression.matchEntire(text); if (matchResult == null) continue; for(MatchGroup matchGroup : matchResult.getGroups()) { final IntRange range = matchGroup.component2(); final int startIndex = range.getStart(); final int length = startIndex + range.getEndInclusive(); final List<RectF> textRects = document.getPageTextRects(i, startIndex, length); final RedactionAnnotation redaction = new RedactionAnnotation(i, textRects); redaction.setColor(Color.BLACK); redaction.setFillColor(Color.BLACK); redaction.setOutlineColor(Color.YELLOW); redaction.setOverlayText("REDACTED"); document.getAnnotationProvider().addAnnotationToPage(redaction); } }
Finally, create a new PDF file for the redacted document by applying redactions.
You can do so using the PdfProcessor API:
val outputFile: File
val document: PdfDocument

// Build a processor task from the document with redactions applied, and write the
// result to `outputFile`.
PdfProcessor.processDocument(
    PdfProcessorTask.fromDocument(document).applyRedactions(),
    outputFile
)

// Open the newly written file as the redacted document.
val redactedDocument = PdfDocumentLoader.openDocument(context, Uri.fromFile(outputFile))
final File outputFile;
final PdfDocument document;

// Build a processor task from the document with redactions applied, and write the
// result to outputFile.
PdfProcessor.processDocument(
    PdfProcessorTask.fromDocument(document).applyRedactions(),
    outputFile);

// Open the newly written file as the redacted document.
final PdfDocument redactedDocument =
    PdfDocumentLoader.openDocument(context, Uri.fromFile(outputFile));
Or, you can use the DocumentSaveOptions#setApplyRedactions() option when saving the document via any of the save methods on PdfDocument.
Bear in mind that this will overwrite the existing document, removing content irreversibly:
// Save options matching the Java variant of this snippet: the third constructor
// argument is `false` in both languages.
// NOTE(review): that argument is understood to be the incremental-save flag; a full
// (non-incremental) rewrite is what you want when content must be removed — confirm
// against the DocumentSaveOptions reference.
val documentSaveOptions = DocumentSaveOptions(null, null, false, null)

// Apply all redaction annotations as part of the save.
documentSaveOptions.setApplyRedactions(true)

// Writes back to the existing document.
document.saveIfModified(documentSaveOptions)
// Save options with redaction enabled, so redaction annotations are applied on save.
final DocumentSaveOptions saveOptions = new DocumentSaveOptions(null, null, false, null);
saveOptions.setApplyRedactions(true);

// Writes back to the existing document.
document.saveIfModified(saveOptions);