Convert images to text on Android
Nutrient Android SDK supports extracting textual information from a scanned PDF. To do so, follow the steps below:
- First, convert the image into a PDF file as described in the image-to-PDF conversion guide.
- Next, perform OCR on the PDF file to extract its textual information. This process is described in detail in the guide on converting a scan into a searchable PDF.
- After performing OCR on the document, retrieve the text, text blocks, words, or glyphs from the pages. Here’s a detailed guide explaining how to do that.
The sample code below demonstrates the entire process, first in Kotlin and then in Java:
// Kotlin sample: image -> PDF -> OCR -> page text.
// `image`, `outputFile`, and `ocrFile` are placeholders the caller must supply;
// `context` is assumed to be an Android Context already in scope.

// Convert the image to a PDF file.
val image: Bitmap = ...
val outputFile: File = ... // Writable file.
val imageSize = Size(image.width.toFloat(), image.height.toFloat())
val pageImage = PageImage(image, PagePosition.CENTER).apply { setJpegQuality(70) }
val newPage = NewPage.emptyPage(imageSize).withPageItem(pageImage).build()

val creationTask = PdfProcessorTask.newPage(newPage)
val disposable = PdfProcessor.processDocumentAsync(creationTask, outputFile)
    .subscribe(
        { progress -> }, // onNext
        { throwable -> }, // onError
        { // onComplete
            // Perform OCR on the file.
            // `Uri.fromFile` yields a proper `file://` URI; parsing the bare
            // absolute path would produce a URI without a scheme.
            val document = PdfDocumentLoader.openDocument(context, Uri.fromFile(outputFile))
            val ocrTask = PdfProcessorTask
                .fromDocument(document)
                .performOcrOnPages((0 until document.pageCount).toSet(), OcrLanguage.ENGLISH)

            val ocrFile: File = ... // Writable file.

            // NOTE(review): `ocrDisposable` is not declared in this snippet;
            // presumably a property of the enclosing class — confirm.
            ocrDisposable = PdfProcessor.processDocumentAsync(ocrTask, ocrFile)
                .subscribe(
                    { progress -> }, // onNext
                    { throwable -> }, // onError
                    { // `onComplete`
                        val ocrDocument = PdfDocumentLoader.openDocument(context, Uri.fromFile(ocrFile))

                        // Retrieve text from the first page (index 0) of the document.
                        val pageText = ocrDocument.getPageText(0)
                        Log.d("PSPDFKit OCR", pageText)
                    }
                )
        }
    )
// Convert the image to a PDF file.final Bitmap image = ...final File outputFile = ... // Writable file.final Size imageSize = new Size(image.getWidth(), image.getHeight());final PageImage pageImage = new PageImage(image, PagePosition.CENTER);pageImage.setJpegQuality(70);final NewPage newPage = NewPage.emptyPage(imageSize).withPageItem(pageImage).build();
final PdfProcessorTask creationTask = PdfProcessorTask.newPage(newPage);final Disposable disposable = PdfProcessor.processDocumentAsync(creationTask, outputFile) .subscribe( progress -> { }, // onNext throwable -> { }, // onError () -> { // Perform OCR on the file. final PdfDocument document = PdfDocumentLoader .openDocument(context, Uri.parse(outputFile.getAbsolutePath())); final Set<Integer> pages = new HashSet(); for (int i = 0; i < document.getPageCount(); i++) { pages.add(i); }
final PdfProcessorTask ocrTask = PdfProcessorTask .fromDocument(document) .performOcrOnPages(pages, OcrLanguage.ENGLISH);
final File ocrFile = ... // Writable file.
ocrDisposable = PdfProcessor.processDocumentAsync(ocrTask, ocrFile) .subscribe( progress -> { }, // onNext throwable -> { }, // onError () -> { // `onComplete` final PdfDocument ocrDocument = PdfDocumentLoader .openDocument(context, Uri.parse(ocrFile.getAbsolutePath()));
// Retrieve text from the document. final String pageText = ocrDocument.getPageText(0); Log.d("PSPDFKit OCR", pageText); } ); } );