From 44f5052fdd5501e084f991e84579a7f8e9f94e0c Mon Sep 17 00:00:00 2001
From: fds-ssg
Date: Thu, 7 Dec 2017 18:06:22 -0500
Subject: [PATCH] Get detailed character information for a pdf page

Adds GetCharacterInformation function to PdfDocument which will retrieve
detailed info about every character on a page, including:
- Character: the actual char
- FontSize
- Bounds

This info can be used for a variety of applications, including rendering
visual boundaries on text, identifying text within certain regions,
parsing page content into logical units based on position, such as words,
tables, etc.
---
 PdfiumViewer.Demo/PdfRangeDocument.cs   |  5 +++
 PdfiumViewer/IPdfDocument.cs            |  7 +++++
 PdfiumViewer/NativeMethods.Pdfium.cs    | 11 +++++++
 PdfiumViewer/PdfCharacterInformation.cs | 55 +++++++++++++++++++++++++++++++++
 PdfiumViewer/PdfDocument.cs             | 10 ++++++
 PdfiumViewer/PdfFile.cs                 | 31 +++++++++++++++++++
 PdfiumViewer/PdfiumViewer.csproj        |  1 +
 7 files changed, 120 insertions(+)
 create mode 100644 PdfiumViewer/PdfCharacterInformation.cs

diff --git a/PdfiumViewer.Demo/PdfRangeDocument.cs b/PdfiumViewer.Demo/PdfRangeDocument.cs
index c56f401..4cdf882 100644
--- a/PdfiumViewer.Demo/PdfRangeDocument.cs
+++ b/PdfiumViewer.Demo/PdfRangeDocument.cs
@@ -288,6 +288,11 @@ public Rectangle RectangleFromPdf(int page, RectangleF rect)
             return _document.RectangleFromPdf(TranslatePage(page), rect);
         }
 
+        public IList<PdfCharacterInformation> GetCharacterInformation(int page)
+        {
+            return _document.GetCharacterInformation(TranslatePage(page));
+        }
+
         private int TranslatePage(int page)
         {
             if (page < 0 || page >= PageCount)
diff --git a/PdfiumViewer/IPdfDocument.cs b/PdfiumViewer/IPdfDocument.cs
index 3baeebe..7c1652b 100644
--- a/PdfiumViewer/IPdfDocument.cs
+++ b/PdfiumViewer/IPdfDocument.cs
@@ -252,5 +252,12 @@ public interface IPdfDocument : IDisposable
         /// <param name="rect">The rectangle to convert.</param>
         /// <returns>The converted rectangle.</returns>
         Rectangle RectangleFromPdf(int page, RectangleF rect);
+
+        /// <summary>
+        /// Get detailed information for all characters on the page.
+        /// </summary>
+        /// <param name="page">The page to get the information for.</param>
+        /// <returns>The character information.</returns>
+        IList<PdfCharacterInformation> GetCharacterInformation(int page);
     }
 }
diff --git a/PdfiumViewer/NativeMethods.Pdfium.cs b/PdfiumViewer/NativeMethods.Pdfium.cs
index 33c245a..2a4c663 100644
--- a/PdfiumViewer/NativeMethods.Pdfium.cs
+++ b/PdfiumViewer/NativeMethods.Pdfium.cs
@@ -258,6 +258,14 @@ public static IntPtr FPDFText_FindStart(IntPtr page, byte[] findWhat, FPDF_SEARC
             }
         }
 
+        public static double FPDFText_GetFontSize(IntPtr page, int index)
+        {
+            lock (LockString)
+            {
+                return Imports.FPDFText_GetFontSize(page, index);
+            }
+        }
+
         public static int FPDFText_GetSchResultIndex(IntPtr handle)
         {
             lock (LockString)
@@ -684,6 +692,9 @@ private static class Imports
             [DllImport("pdfium.dll")]
             public static extern IntPtr FPDFText_FindStart(IntPtr page, byte[] findWhat, FPDF_SEARCH_FLAGS flags, int start_index);
 
+            [DllImport("pdfium.dll")]
+            public static extern double FPDFText_GetFontSize(IntPtr page, int index);
+
             [DllImport("pdfium.dll")]
             public static extern int FPDFText_GetSchResultIndex(IntPtr handle);
 
diff --git a/PdfiumViewer/PdfCharacterInformation.cs b/PdfiumViewer/PdfCharacterInformation.cs
new file mode 100644
index 0000000..dba51fc
--- /dev/null
+++ b/PdfiumViewer/PdfCharacterInformation.cs
@@ -0,0 +1,55 @@
+using System;
+using System.Collections.Generic;
+using System.Drawing;
+using System.Text;
+
+namespace PdfiumViewer
+{
+    /// <summary>
+    /// Describes a single character on a PDF page.
+    /// </summary>
+    public struct PdfCharacterInformation
+    {
+        /// <summary>
+        /// Gets the page the character was read from.
+        /// </summary>
+        public int Page { get; }
+
+        /// <summary>
+        /// Gets the index of the character within the text of the page.
+        /// </summary>
+        public int Offset { get; }
+
+        /// <summary>
+        /// Gets the font size reported by PDFium for the character.
+        /// </summary>
+        public double FontSize { get; }
+
+        /// <summary>
+        /// Gets the character itself.
+        /// </summary>
+        public char Character { get; }
+
+        /// <summary>
+        /// Gets the bounding rectangle of the character.
+        /// </summary>
+        public RectangleF Bounds { get; }
+
+        /// <summary>
+        /// Initializes a new instance of the <see cref="PdfCharacterInformation"/> struct.
+        /// </summary>
+        /// <param name="page">The page the character was read from.</param>
+        /// <param name="offset">The index of the character within the text of the page.</param>
+        /// <param name="character">The character itself.</param>
+        /// <param name="fontSize">The font size of the character.</param>
+        /// <param name="bounds">The bounding rectangle of the character.</param>
+        public PdfCharacterInformation(int page, int offset, char character, double fontSize, RectangleF bounds)
+        {
+            Page = page;
+            Offset = offset;
+            Character = character;
+            FontSize = fontSize;
+            Bounds = bounds;
+        }
+    }
+}
diff --git a/PdfiumViewer/PdfDocument.cs b/PdfiumViewer/PdfDocument.cs
index 9541b49..04c777c 100644
--- a/PdfiumViewer/PdfDocument.cs
+++ b/PdfiumViewer/PdfDocument.cs
@@ -598,5 +598,15 @@ protected void Dispose(bool disposing)
                 _disposed = true;
             }
         }
+
+        /// <summary>
+        /// Get detailed information for all characters on the page.
+        /// </summary>
+        /// <param name="page">The page to get the information for.</param>
+        /// <returns>The character information.</returns>
+        public IList<PdfCharacterInformation> GetCharacterInformation(int page)
+        {
+            return _file.GetCharacterInformation(page);
+        }
     }
 }
diff --git a/PdfiumViewer/PdfFile.cs b/PdfiumViewer/PdfFile.cs
index 298e819..fa93f3c 100644
--- a/PdfiumViewer/PdfFile.cs
+++ b/PdfiumViewer/PdfFile.cs
@@ -497,6 +497,37 @@ private string GetPdfText(PageData pageData, PdfTextSpan textSpan)
             return FPDFEncoding.GetString(result, 0, textSpan.Length * 2);
         }
 
+        /// <summary>
+        /// Get detailed information for all characters on the page.
+        /// </summary>
+        /// <param name="page">The page to get the information for.</param>
+        /// <returns>One entry per character, in text order; empty when the page has no text.</returns>
+        public IList<PdfCharacterInformation> GetCharacterInformation(int page)
+        {
+            using (var pageData = new PageData(_document, _form, page))
+            {
+                int charCount = NativeMethods.FPDFText_CountChars(pageData.TextPage);
+
+                // FPDFText_CountChars reports -1 on failure; treat that like an empty page.
+                if (charCount <= 0)
+                {
+                    return new List<PdfCharacterInformation>();
+                }
+
+                var result = new List<PdfCharacterInformation>(charCount);
+                string text = GetPdfText(pageData, new PdfTextSpan(page, 0, charCount));
+
+                for (int i = 0; i < charCount; i++)
+                {
+                    var bounds = GetBounds(pageData.TextPage, i);
+                    double fontSize = NativeMethods.FPDFText_GetFontSize(pageData.TextPage, i);
+                    result.Add(new PdfCharacterInformation(page, i, text[i], fontSize, bounds));
+                }
+
+                return result;
+            }
+        }
+
         public void DeletePage (int pageNumber)
         {
             NativeMethods.FPDFPage_Delete(_document, pageNumber);
diff --git a/PdfiumViewer/PdfiumViewer.csproj b/PdfiumViewer/PdfiumViewer.csproj
index 24c8e9c..ab5e472 100644
--- a/PdfiumViewer/PdfiumViewer.csproj
+++ b/PdfiumViewer/PdfiumViewer.csproj
@@ -71,6 +71,7 @@
+