Do You Have Sample Code for Text Extraction?

Document Converter Services version 11.0 includes a text extraction feature that extracts the text content of text-based PDF files. The following sample code demonstrates how to use it:

/// <summary>
/// Sends a document to the conversion service for text extraction, writes the result to a
/// TXT file in the current folder and then launches that file.
/// </summary>
/// <param name="args">
/// Optional command line arguments. When present, the first element is used as the path of
/// the document to process; otherwise the first *.pdf file in the current folder is used.
/// </param>
static void Main(string[] args)
{
    DocumentConverterServiceClient client = null;

    try
    {
        // ** Delete any processed files from a previous run.
        // ** NOTE: this removes every *.txt file in the current folder, not just the
        // ** files that were generated by this sample.
        foreach (FileInfo f in new DirectoryInfo(".").GetFiles("*.txt"))
        {
            f.Delete();
        }

        // ** Determine the source file.
        string sourceFileName;
        if (args.Length > 0)
        {
            sourceFileName = args[0];
        }
        else
        {
            // ** If nothing is specified, then use the first PDF file from the current folder.
            string[] sourceFiles = Directory.GetFiles(Directory.GetCurrentDirectory(), "*.pdf");
            if (sourceFiles.Length == 0)
            {
                Console.WriteLine("Please specify a document to extract text from.");
                Console.ReadKey();
                return;
            }

            sourceFileName = sourceFiles[0];
        }

        // ** Read the source file into a byte array.
        byte[] sourceFile = File.ReadAllBytes(sourceFileName);

        // ** Open the service and configure the bindings.
        client = OpenService(SERVICE_URL);

        // ** Set the absolute minimum open options.
        OpenOptions openOptions = new OpenOptions();
        openOptions.OriginalFileName = Path.GetFileName(sourceFileName);
        openOptions.FileExtension = Path.GetExtension(sourceFileName);

        TextExtractSettings textExtractSettings = new TextExtractSettings();
        textExtractSettings.PageRange = "*"; // All pages.

        // ** Carry out the extraction.
        byte[] convFile = client.ExtractText(sourceFile, openOptions, textExtractSettings);

        // ** Write the extracted text to the current folder using the source file's name
        // ** with a TXT extension. WriteAllBytes creates (or overwrites) the file and
        // ** closes it again, so no explicit stream handling is needed.
        string destinationFileName = Path.GetFileNameWithoutExtension(sourceFileName) + ".txt";
        File.WriteAllBytes(destinationFileName, convFile);
        Console.WriteLine("Text extracted to " + Path.GetFullPath(destinationFileName));

        // ** Open the generated file in a text file reader.
        Console.WriteLine("Launching file in reader");
        Process.Start(destinationFileName);
    }
    catch (FaultException<WebServiceFaultException> ex)
    {
        // ** Faults raised by the web service: report the type carried in the fault detail.
        Console.WriteLine("FaultException occurred: ExceptionType: " +
                            ex.Detail.ExceptionType.ToString());
    }
    catch (Exception ex)
    {
        // ** Anything else (file access, connectivity, ...): dump the full exception.
        Console.WriteLine(ex.ToString());
    }
    finally
    {
        // ** Runs on every path, including the early return above where client is still null.
        CloseService(client);
    }

    // ** Keep the console window open until a key is pressed.
    Console.ReadKey();
}

/// <summary>
/// Build a BasicHttpBinding based client for the web service at the specified address
/// and open it.
/// </summary>
/// <param name="address">The URL of the web service endpoint.</param>
/// <returns>An opened instance of the web service client.</returns>
public static DocumentConverterServiceClient OpenService(string address)
{
    DocumentConverterServiceClient serviceClient = null;

    try
    {
        // ** Upper limit (50MB) applied to received messages, arrays and string content.
        const int maxDocumentSize = 50 * 1024 * 1024;
        // ** Generous client timeout to deal with (very) long running requests.
        TimeSpan requestTimeout = TimeSpan.FromMinutes(120);

        BasicHttpBinding httpBinding = new BasicHttpBinding
        {
            SendTimeout = requestTimeout,
            ReceiveTimeout = requestTimeout,
            MaxReceivedMessageSize = maxDocumentSize
        };

        // ** Authenticate with Windows credentials at the transport level.
        httpBinding.Security.Mode = BasicHttpSecurityMode.TransportCredentialOnly;
        httpBinding.Security.Transport.ClientCredentialType = HttpClientCredentialType.Windows;

        httpBinding.ReaderQuotas.MaxArrayLength = maxDocumentSize;
        httpBinding.ReaderQuotas.MaxStringContentLength = maxDocumentSize;

        // ** An identity (any identity) is specified to get past .net3.5 sp1.
        EndpointAddress endpoint = new EndpointAddress(
            new Uri(address),
            EndpointIdentity.CreateUpnIdentity("unknown"));

        serviceClient = new DocumentConverterServiceClient(httpBinding, endpoint);
        serviceClient.Open();

        return serviceClient;
    }
    catch (Exception)
    {
        // ** Hand whatever was created so far to CloseService, then let the caller
        // ** deal with the original exception.
        CloseService(serviceClient);
        throw;
    }
}

/// <summary>
/// Release the client: close it gracefully when it is open, otherwise abort it so that a
/// faulted or partially opened client is not left behind.
/// </summary>
/// <param name="client">The client to close. Nothing happens when this is null.</param>
public static void CloseService(DocumentConverterServiceClient client)
{
    if (client == null)
    {
        return;
    }

    if (client.State == CommunicationState.Opened)
    {
        try
        {
            client.Close();
        }
        catch (CommunicationException)
        {
            // ** The graceful close failed; tear the client down immediately instead.
            client.Abort();
        }
        catch (TimeoutException)
        {
            // ** The close timed out; tear the client down immediately instead.
            client.Abort();
        }
    }
    else if (client.State != CommunicationState.Closed)
    {
        // ** Close() is only attempted on an opened client. For any other state that is
        // ** not already closed (e.g. faulted after a failed call) abort instead.
        client.Abort();
    }
}