GetPageTextWithCoords(String,TextExtractionOutputInfo) Method
Returns various information about the extracted text, regardless of whether it is visible or hidden, on the current page of the loaded PDF document, such as the bounding box coordinates, the font information, the text mode, the text size, the glyph widths and the glyph character representations. The extracted text from the current page is divided into words. This method allows you to include or exclude each piece of information to better suit your use case. Each word, together with its text and font properties and the widths of its single characters, is recorded on one separate line. The space character (between the words in the text) is also considered a word. Two or more spaces in a row are considered one word. If all flags are set, the resulting string for one word is formatted this way:
the horizontal (X) coordinate of the top left point of the rendering area + [FieldSeparator] +
the vertical (Y) coordinate of the top left point of the rendering area + [FieldSeparator] +
the horizontal (X) coordinate of the top right point of the rendering area + [FieldSeparator] +
the vertical (Y) coordinate of the top right point of the rendering area + [FieldSeparator] +
the horizontal (X) coordinate of the bottom right point of the rendering area + [FieldSeparator] +
the vertical (Y) coordinate of the bottom right point of the rendering area + [FieldSeparator] +
the horizontal (X) coordinate of the bottom left point of the rendering area + [FieldSeparator] +
the vertical (Y) coordinate of the bottom left point of the rendering area + [FieldSeparator] +
extracted word + [FieldSeparator] +
font name + [FieldSeparator] +
font box height + [FieldSeparator] +
text mode + [FieldSeparator] +
text size + [FieldSeparator] +
array of widths for each single glyph of the extracted word delimited by the [FieldSeparator] +
array of character representations for each single glyph of the extracted word delimited by the [FieldSeparator] + EOL
The rendering area means the rectangle area on the page where the extracted word is really situated (rendered). You can use the provided coordinates to easily calculate the dimensions of this area, the coordinates of the single characters and the text rotation angle. You can also benefit from using the GuessPageTextRotation method if the presented text is rotated in various angles on the current page.
The result for the current page should contain exactly as many lines as there are words, including the space-words, in the text on that page.
Parameters
- FieldSeparator
- The string that is used to delimit the above enumerated fields in the resulting text.
- Options
- A bitwise combination of values of the TextExtractionOutputInfo enumeration. Specifies which information should be provided in the output.
Return Value
The whole page text, divided into one word per text line, including the text coordinates and properties in the format described above. The
GetStat method can subsequently be used to determine whether this method has been successful.
How to find out and calculate the dimensions of the rendering area for the first word and for the single characters.
'Example: extracts the words of the current page of "test.pdf" and, for the first word,
'calculates the dimensions of its rendering area, the sum of its glyph widths
'and its rotation angle.
Dim caption As String = "Example: GetPageTextWithCoords"
Using gdpicturePDF As GdPicturePDF = New GdPicturePDF()
    If gdpicturePDF.LoadFromFile("test.pdf", False) = GdPictureStatus.OK Then
        'The field separator passed here must be the same one used to split the line below.
        Dim text As String = gdpicturePDF.GetPageTextWithCoords("~", TextExtractionOutputInfo.IncludeWordBounds Or
            TextExtractionOutputInfo.IncludeWordString Or
            TextExtractionOutputInfo.IncludeFontName Or
            TextExtractionOutputInfo.IncludeFontBoxHeight Or
            TextExtractionOutputInfo.IncludeTextMode Or
            TextExtractionOutputInfo.IncludeTextSize Or
            TextExtractionOutputInfo.IncludeGlyphWidths)
        If gdpicturePDF.GetStat() = GdPictureStatus.OK Then
            'Considering only the first word as an example.
            'Getting the first line with properties of the first word.
            Dim index As Integer = text.IndexOf(vbCrLf, StringComparison.Ordinal)
            If index < 0 Then
                'No line terminator means there is no complete word line to analyze
                '(for example, a page without any text).
                MessageBox.Show("No complete word line was found in the extracted text.", caption)
            Else
                Dim firstLine As String = text.Substring(0, index)
                'Field layout with the flags used above:
                '0-7 = corner coordinates (top left, top right, bottom right, bottom left; X then Y),
                '8 = word, 9 = font name, 10 = font box height, 11 = text mode, 12 = text size,
                '13 and further = glyph widths.
                Dim coord As String() = firstLine.Split("~"c)
                'Calculating the vector (top left - bottom left) to determine the height of the rendering area for the first word.
                Dim vectorXH As Double = Double.Parse(coord(0)) - Double.Parse(coord(6))
                Dim vectorYH As Double = Double.Parse(coord(7)) - Double.Parse(coord(1))
                'Calculating the height of the area.
                Dim boxHeight As Double = Math.Sqrt(vectorXH * vectorXH + vectorYH * vectorYH)
                'Calculating the vector (bottom left - bottom right) to determine the width of the rendering area for the first word.
                Dim vectorXW As Double = Double.Parse(coord(6)) - Double.Parse(coord(4))
                Dim vectorYW As Double = Double.Parse(coord(7)) - Double.Parse(coord(5))
                'Calculating the width of the area.
                Dim boxWidth As Double = Math.Sqrt(vectorXW * vectorXW + vectorYW * vectorYW)
                'Getting the first extracted word.
                Dim word As String = coord(8)
                Dim wordWidth As Double = 0
                '13 is the position of the width of first character in the extracted word
                'NOTE(review): assumes one glyph width per character of the word - confirm for ligatures.
                For i As Integer = 13 To (13 + word.Length) - 1
                    'Getting the widths of all single characters in the first word.
                    wordWidth += Double.Parse(coord(i))
                Next
                'Calculating the word rotation angle.
                Dim angle As Double = Math.Atan2(vectorXH, vectorYH) * (180 / Math.PI)
                'Be aware that the resulting angle is relative to the chosen base axis.
                'wordWidth should be the same as the boxWidth
                If Math.Round(wordWidth, 2) = Math.Round(boxWidth, 2) Then
                    MessageBox.Show("The example is followed successfully.", caption)
                Else
                    MessageBox.Show("Something is wrong:" + vbCrLf + " wordWidth = " + Math.Round(wordWidth, 2).ToString() + vbCrLf + " boxWidth = " + Math.Round(boxWidth, 2).ToString(), caption)
                End If
                'Continue...
            End If
        Else
            MessageBox.Show("The GetPageTextWithCoords() method has failed with the status: " + gdpicturePDF.GetStat().ToString(), caption)
        End If
    Else
        MessageBox.Show("The file can't be loaded.", caption)
    End If
End Using
//Example: extracts the words of the current page of "test.pdf" and, for the first word,
//calculates the dimensions of its rendering area, the sum of its glyph widths
//and its rotation angle.
string caption = "Example: GetPageTextWithCoords";
using (GdPicturePDF gdpicturePDF = new GdPicturePDF())
{
    if (gdpicturePDF.LoadFromFile("test.pdf", false) == GdPictureStatus.OK)
    {
        //The field separator passed here must be the same one used to split the line below.
        string text = gdpicturePDF.GetPageTextWithCoords("~", TextExtractionOutputInfo.IncludeWordBounds |
            TextExtractionOutputInfo.IncludeWordString |
            TextExtractionOutputInfo.IncludeFontName |
            TextExtractionOutputInfo.IncludeFontBoxHeight |
            TextExtractionOutputInfo.IncludeTextMode |
            TextExtractionOutputInfo.IncludeTextSize |
            TextExtractionOutputInfo.IncludeGlyphWidths);
        if (gdpicturePDF.GetStat() == GdPictureStatus.OK)
        {
            //Considering only the first word as an example.
            //Getting the first line with properties of the first word.
            int index = text.IndexOf("\r\n", StringComparison.Ordinal);
            if (index < 0)
            {
                //No line terminator means there is no complete word line to analyze
                //(for example, a page without any text).
                MessageBox.Show("No complete word line was found in the extracted text.", caption);
            }
            else
            {
                string firstLine = text.Substring(0, index);
                //Field layout with the flags used above:
                //0-7 = corner coordinates (top left, top right, bottom right, bottom left; X then Y),
                //8 = word, 9 = font name, 10 = font box height, 11 = text mode, 12 = text size,
                //13 and further = glyph widths.
                string[] coord = firstLine.Split('~');
                //Calculating the vector (top left - bottom left) to determine the height of the rendering area for the first word.
                double vectorXH = double.Parse(coord[0]) - double.Parse(coord[6]);
                double vectorYH = double.Parse(coord[7]) - double.Parse(coord[1]);
                //Calculating the height of the area.
                double boxHeight = Math.Sqrt(vectorXH * vectorXH + vectorYH * vectorYH);
                //Calculating the vector (bottom left - bottom right) to determine the width of the rendering area for the first word.
                double vectorXW = double.Parse(coord[6]) - double.Parse(coord[4]);
                double vectorYW = double.Parse(coord[7]) - double.Parse(coord[5]);
                //Calculating the width of the area.
                double boxWidth = Math.Sqrt(vectorXW * vectorXW + vectorYW * vectorYW);
                //Getting the first extracted word.
                string word = coord[8];
                double wordWidth = 0;
                //13 is the position of the width of first character in the extracted word
                //NOTE(review): assumes one glyph width per character of the word - confirm for ligatures.
                for (int i = 13; i < (13 + word.Length); i++)
                {
                    //Getting the widths of all single characters in the first word.
                    wordWidth += double.Parse(coord[i]);
                }
                //Calculating the word rotation angle.
                double angle = Math.Atan2(vectorXH, vectorYH) * (180 / Math.PI);
                //Be aware that the resulting angle is relative to the chosen base axis.
                //wordWidth should be the same as the boxWidth
                if (Math.Round(wordWidth, 2) == Math.Round(boxWidth, 2))
                {
                    MessageBox.Show("The example is followed successfully.", caption);
                }
                else
                {
                    MessageBox.Show("Something is wrong:\n wordWidth = " + Math.Round(wordWidth, 2).ToString() + "\n boxWidth = " + Math.Round(boxWidth, 2).ToString(), caption);
                }
                //Continue...
            }
        }
        else
        {
            MessageBox.Show("The GetPageTextWithCoords() method has failed with the status: " + gdpicturePDF.GetStat().ToString(), caption);
        }
    }
    else
    {
        MessageBox.Show("The file can't be loaded.", caption);
    }
}