OCR Processor Features
17 Sep 2024 · 24 minutes to read
Performing OCR for an entire document
To perform OCR on an entire PDF document, use the PerformOCR method of the OCRProcessor class. Refer to the following code example.
//Initialize the OCR processor.
using (OCRProcessor processor = new OCRProcessor())
//Load an existing PDF document. The using statement releases the input file even if OCR fails.
using (FileStream stream = new FileStream("Input.pdf", FileMode.Open))
{
    PdfLoadedDocument document = new PdfLoadedDocument(stream);
    //Set OCR language.
    processor.Settings.Language = Languages.English;
    //Perform OCR on the input document.
    processor.PerformOCR(document);
    //Create file stream.
    using (FileStream outputFileStream = new FileStream("Output.pdf", FileMode.Create, FileAccess.ReadWrite))
    {
        //Save the PDF document to file stream.
        document.Save(outputFileStream);
    }
    //Close the document.
    document.Close(true);
}
'Initialize the OCR processor.
Using processor As OCRProcessor = New OCRProcessor()
    'Load an existing PDF document. The Using block releases the input file even if OCR fails.
    Using stream As FileStream = New FileStream("Input.pdf", FileMode.Open)
        Dim document As PdfLoadedDocument = New PdfLoadedDocument(stream)
        'Set OCR language.
        processor.Settings.Language = Languages.English
        'Perform OCR on the input document.
        processor.PerformOCR(document)
        'Create file stream.
        Using outputFileStream As FileStream = New FileStream("Output.pdf", FileMode.Create, FileAccess.ReadWrite)
            'Save the PDF document to file stream.
            document.Save(outputFileStream)
        End Using
        'Close the document.
        document.Close(True)
    End Using
End Using
NOTE
The PerformOCR method returns only the text OCRed by OCRProcessor. Other existing text in the PDF page will not be returned in this method. Please check text extraction feature for this.
You can download a complete working sample from GitHub.
Performing OCR for a region of the document
To perform OCR on a particular region or several regions of a PDF page with the help of PageRegion class in OCRSettings, refer to the following code sample.
//Initialize the OCR processor.
using (OCRProcessor processor = new OCRProcessor())
//Load an existing PDF document. The using statement releases the input file even if OCR fails.
using (FileStream stream = new FileStream("Input.pdf", FileMode.Open))
{
    PdfLoadedDocument document = new PdfLoadedDocument(stream);
    //Set OCR language.
    processor.Settings.Language = Languages.English;
    //Rectangle on the page that should be processed.
    RectangleF rect = new RectangleF(0, 100, 950, 150);
    //Create the page region for page index 0 and assign the rectangle to it.
    PageRegion region = new PageRegion();
    region.PageIndex = 0;
    region.PageRegions = new RectangleF[] { rect };
    //Set page regions so that OCR is limited to them.
    List<PageRegion> pageRegions = new List<PageRegion> { region };
    processor.Settings.Regions = pageRegions;
    //Perform OCR on the input document.
    processor.PerformOCR(document);
    //Create file stream.
    using (FileStream outputFileStream = new FileStream("Output.pdf", FileMode.Create, FileAccess.ReadWrite))
    {
        //Save the PDF document to file stream.
        document.Save(outputFileStream);
    }
    //Close the document.
    document.Close(true);
}
'Initialize the OCR processor.
Using processor As OCRProcessor = New OCRProcessor()
    'Load an existing PDF document. The Using block releases the input file even if OCR fails.
    Using stream As FileStream = New FileStream("Input.pdf", FileMode.Open)
        Dim document As PdfLoadedDocument = New PdfLoadedDocument(stream)
        'Set OCR language.
        processor.Settings.Language = Languages.English
        'Rectangle on the page that should be processed.
        Dim rect As RectangleF = New RectangleF(0, 100, 950, 150)
        'Create the page region for page index 0 and assign the rectangle to it.
        Dim region As PageRegion = New PageRegion()
        region.PageIndex = 0
        region.PageRegions = New RectangleF() {rect}
        'Set page regions so that OCR is limited to them.
        Dim pageRegions As List(Of PageRegion) = New List(Of PageRegion)()
        pageRegions.Add(region)
        processor.Settings.Regions = pageRegions
        'Perform OCR on the input document.
        processor.PerformOCR(document)
        'Create file stream.
        Using outputFileStream As FileStream = New FileStream("Output.pdf", FileMode.Create, FileAccess.ReadWrite)
            'Save the PDF document to file stream.
            document.Save(outputFileStream)
        End Using
        'Close the document.
        document.Close(True)
    End Using
End Using
You can download a complete working sample from GitHub.
Performing OCR with tesseract version 3.05
The TesseractVersion property is used to switch the tesseract version between 3.02 and 3.05. By default, OCR works with tesseract version 3.02.
NOTE
The starting supported version of tesseract in ASP.NET Core is 4.0, so the lower tesseract versions 3.02 and 3.05 are not supported and the TesseractVersion property is not available on the ASP.NET Core platform.
The following code sample demonstrates the OCR processor with Tesseract version 3.05 for PDF documents.
//Initialize the OCR processor.
using (OCRProcessor processor = new OCRProcessor())
//Load an existing PDF document. The using statement releases the input file even if OCR fails.
using (FileStream stream = new FileStream("Input.pdf", FileMode.Open))
{
    PdfLoadedDocument document = new PdfLoadedDocument(stream);
    //Set OCR language.
    processor.Settings.Language = Languages.English;
    //Select tesseract version 3.05 as the OCR engine.
    processor.Settings.TesseractVersion = TesseractVersion.Version3_05;
    //Perform OCR on the input document.
    processor.PerformOCR(document);
    //Create file stream.
    using (FileStream outputFileStream = new FileStream("Output.pdf", FileMode.Create, FileAccess.ReadWrite))
    {
        //Save the PDF document to file stream.
        document.Save(outputFileStream);
    }
    //Close the document.
    document.Close(true);
}
'Initialize the OCR processor (parameterless constructor, matching the C# sample).
Using processor As OCRProcessor = New OCRProcessor()
    'Load an existing PDF document. The Using block releases the input file even if OCR fails.
    Using stream As FileStream = New FileStream("Input.pdf", FileMode.Open)
        Dim document As PdfLoadedDocument = New PdfLoadedDocument(stream)
        'Set OCR language.
        processor.Settings.Language = Languages.English
        'Select tesseract version 3.05 as the OCR engine.
        processor.Settings.TesseractVersion = TesseractVersion.Version3_05
        'Perform OCR on the input document.
        processor.PerformOCR(document)
        'Create file stream.
        Using outputFileStream As FileStream = New FileStream("Output.pdf", FileMode.Create, FileAccess.ReadWrite)
            'Save the PDF document to file stream.
            document.Save(outputFileStream)
        End Using
        'Close the document.
        document.Close(True)
    End Using
End Using
Performing OCR with Tesseract Version 4.0
The TesseractVersion property is used to switch the tesseract version to 4.0. By default, OCR will be performed with tesseract version 3.02.
NOTE
In the ASP.NET Core platform, the default and starting supported version of tesseract is 4.0, so the TesseractVersion property is not available there.
The following code sample explains the OCR processor with Tesseract version 4.0 for PDF documents.
//Initialize the OCR processor.
using (OCRProcessor processor = new OCRProcessor())
//Load an existing PDF document. The using statement releases the input file even if OCR fails.
using (FileStream stream = new FileStream("Input.pdf", FileMode.Open))
{
    PdfLoadedDocument document = new PdfLoadedDocument(stream);
    //Set OCR language.
    processor.Settings.Language = Languages.English;
    //Select tesseract version 4.0 as the OCR engine.
    processor.Settings.TesseractVersion = TesseractVersion.Version4_0;
    //Perform OCR on the input document.
    processor.PerformOCR(document);
    //Create file stream.
    using (FileStream outputFileStream = new FileStream("Output.pdf", FileMode.Create, FileAccess.ReadWrite))
    {
        //Save the PDF document to file stream.
        document.Save(outputFileStream);
    }
    //Close the document.
    document.Close(true);
}
'Initialize the OCR processor (parameterless constructor, matching the C# sample).
Using processor As OCRProcessor = New OCRProcessor()
    'Load an existing PDF document. The Using block releases the input file even if OCR fails.
    Using stream As FileStream = New FileStream("Input.pdf", FileMode.Open)
        Dim document As PdfLoadedDocument = New PdfLoadedDocument(stream)
        'Set OCR language.
        processor.Settings.Language = Languages.English
        'Select tesseract version 4.0 as the OCR engine.
        processor.Settings.TesseractVersion = TesseractVersion.Version4_0
        'Perform OCR on the input document.
        processor.PerformOCR(document)
        'Create file stream.
        Using outputFileStream As FileStream = New FileStream("Output.pdf", FileMode.Create, FileAccess.ReadWrite)
            'Save the PDF document to file stream.
            document.Save(outputFileStream)
        End Using
        'Close the document.
        document.Close(True)
    End Using
End Using
Performing OCR on image
The below code example illustrates how to perform OCR on image file using PerformOCR method in OCRProcessor class.
//Initialize the OCR processor.
using (OCRProcessor processor = new OCRProcessor())
//Load the input image. The using statement closes the image file once OCR is done.
using (FileStream imageStream = new FileStream("Input.jpg", FileMode.Open))
{
    //Set OCR language.
    processor.Settings.Language = Languages.English;
    //Perform OCR on the image stream; the result is returned as a string.
    string ocrText = processor.PerformOCR(imageStream);
}
'Initialize the OCR processor.
Using processor As OCRProcessor = New OCRProcessor()
    'Load the input image. The Using block closes the image file once OCR is done.
    Using imageStream As FileStream = New FileStream("Input.jpg", FileMode.Open)
        'Set OCR language.
        processor.Settings.Language = Languages.English
        'Perform OCR on the image stream; the result is returned as a string.
        Dim ocrText As String = processor.PerformOCR(imageStream)
    End Using
End Using
You can download a complete working sample from GitHub.
You can get the OCRed Unicode text from an image file by using the UnicodeFont
property in OCRProcessor. For more information, refer to the following code sample.
//Initialize the OCR processor.
using (OCRProcessor processor = new OCRProcessor())
//Load the input image. The using statement closes the image file once OCR is done.
using (FileStream imageStream = new FileStream("Input.jpg", FileMode.Open))
//Get stream from the font file; it is closed together with the image stream.
using (FileStream fontStream = new FileStream(@"ARIALUNI.ttf", FileMode.Open))
{
    //Set OCR language.
    processor.Settings.Language = Languages.English;
    //Sets Unicode font to preserve the Unicode characters.
    processor.UnicodeFont = new PdfTrueTypeFont(fontStream, 8);
    //Perform OCR on the image stream; the result is returned as a string.
    string ocrText = processor.PerformOCR(imageStream);
}
//By default, Unicode characters can be extracted from image file in .NET Framework applications like WF, WPF, ASP.NET and ASP.NET MVC.
//By default, Unicode characters can be extracted from image file in .NET Framework applications like WF, WPF, ASP.NET and ASP.NET MVC.
Performing OCR for large PDF documents
To optimize memory usage when performing OCR on large PDF documents, enable the isMemoryOptimized option in the PerformOCR method of the OCRProcessor class. The optimization is effective only in a multithreaded environment or for PDF documents that contain many images. For more details, refer to the following code examples.
NOTE
Memory optimization for performing OCR on large PDF documents is not supported in ASP.NET Core platform.
//Initialize the OCR processor.
using (OCRProcessor processor = new OCRProcessor())
//Load an existing PDF document. The using statement releases the input file even if OCR fails.
using (FileStream stream = new FileStream("Input.pdf", FileMode.Open))
{
    PdfLoadedDocument document = new PdfLoadedDocument(stream);
    //Set OCR language.
    processor.Settings.Language = Languages.English;
    //Perform OCR on the input document.
    //NOTE(review): the accompanying text says to enable isMemoryOptimized, but this call passes only
    //the document. Confirm which PerformOCR overload accepts that flag and pass it here.
    processor.PerformOCR(document);
    //Create file stream.
    using (FileStream outputFileStream = new FileStream("Output.pdf", FileMode.Create, FileAccess.ReadWrite))
    {
        //Save the PDF document to file stream.
        document.Save(outputFileStream);
    }
    //Close the document.
    document.Close(true);
}
'Initialize the OCR processor.
Using processor As OCRProcessor = New OCRProcessor()
    'Load an existing PDF document. The Using block releases the input file even if OCR fails.
    Using stream As FileStream = New FileStream("Input.pdf", FileMode.Open)
        Dim document As PdfLoadedDocument = New PdfLoadedDocument(stream)
        'Set OCR language.
        processor.Settings.Language = Languages.English
        'Perform OCR on the input document.
        'NOTE(review): the accompanying text says to enable isMemoryOptimized, but this call passes only
        'the document. Confirm which PerformOCR overload accepts that flag and pass it here.
        processor.PerformOCR(document)
        'Create file stream.
        Using outputFileStream As FileStream = New FileStream("Output.pdf", FileMode.Create, FileAccess.ReadWrite)
            'Save the PDF document to file stream.
            document.Save(outputFileStream)
        End Using
        'Close the document.
        document.Close(True)
    End Using
End Using
Performing OCR on rotated page of PDF document
You can get the OCRed text from a rotated page of a PDF document by setting the PageSegment property to AutoOsd from the PageSegMode enum. For more details, refer to the following code sample.
//Initialize the OCR processor.
using (OCRProcessor processor = new OCRProcessor())
//Load an existing PDF document. The using statement releases the input file even if OCR fails.
using (FileStream stream = new FileStream("Input.pdf", FileMode.Open))
{
    PdfLoadedDocument document = new PdfLoadedDocument(stream);
    //Set OCR language.
    processor.Settings.Language = Languages.English;
    //Set the page segmentation mode to AutoOsd, which this sample uses for rotated pages.
    processor.Settings.PageSegment = PageSegMode.AutoOsd;
    //Perform OCR on the input document.
    processor.PerformOCR(document);
    //Create file stream.
    using (FileStream outputFileStream = new FileStream("Output.pdf", FileMode.Create, FileAccess.ReadWrite))
    {
        //Save the PDF document to file stream.
        document.Save(outputFileStream);
    }
    //Close the document.
    document.Close(true);
}
'Initialize the OCR processor.
Using processor As OCRProcessor = New OCRProcessor()
    'Load an existing PDF document. The Using block releases the input file even if OCR fails.
    Using stream As FileStream = New FileStream("Input.pdf", FileMode.Open)
        Dim document As PdfLoadedDocument = New PdfLoadedDocument(stream)
        'Set OCR language.
        processor.Settings.Language = Languages.English
        'Set the page segmentation mode to AutoOsd, which this sample uses for rotated pages.
        processor.Settings.PageSegment = PageSegMode.AutoOsd
        'Perform OCR on the input document.
        processor.PerformOCR(document)
        'Create file stream.
        Using outputFileStream As FileStream = New FileStream("Output.pdf", FileMode.Create, FileAccess.ReadWrite)
            'Save the PDF document to file stream.
            document.Save(outputFileStream)
        End Using
        'Close the document.
        document.Close(True)
    End Using
End Using
You can download a complete working sample from GitHub.
Layout result from OCR
You can get the OCRed text and its bounds from a scanned PDF document by using the OCRLayoutResult Class. Refer to the following code sample.
//Initialize the OCR processor.
using (OCRProcessor processor = new OCRProcessor())
//Load an existing PDF document. The using statement releases the input file even if OCR fails.
using (FileStream stream = new FileStream("Input.pdf", FileMode.Open))
{
    PdfLoadedDocument document = new PdfLoadedDocument(stream);
    //Set OCR language.
    processor.Settings.Language = Languages.English;
    //Layout result; it is assigned by PerformOCR through the out parameter, so no instance is created here.
    OCRLayoutResult layoutResult;
    //Perform OCR with input document and tessdata (Language packs).
    processor.PerformOCR(document, @"Tessdata/", out layoutResult);
    //Get OCRed line collection from first page.
    OCRLineCollection lines = layoutResult.Pages[0].Lines;
    //Get each OCRed line and its bounds.
    foreach (Line line in lines)
    {
        string text = line.Text;
        RectangleF bounds = line.Rectangle;
    }
    //Create file stream.
    using (FileStream outputFileStream = new FileStream("Output.pdf", FileMode.Create, FileAccess.ReadWrite))
    {
        //Save the PDF document to file stream.
        document.Save(outputFileStream);
    }
    //Close the document.
    document.Close(true);
}
'Initialize the OCR processor.
Using processor As OCRProcessor = New OCRProcessor()
    'Load an existing PDF document. The Using block releases the input file even if OCR fails.
    Using stream As FileStream = New FileStream("Input.pdf", FileMode.Open)
        Dim document As PdfLoadedDocument = New PdfLoadedDocument(stream)
        'Set OCR language.
        processor.Settings.Language = Languages.English
        'Create the layout result.
        Dim layoutResult As OCRLayoutResult = New OCRLayoutResult()
        'Perform OCR with input document and tessdata (Language packs).
        processor.PerformOCR(document, "Tessdata/", layoutResult)
        'Get OCRed line collection from first page.
        Dim lines As OCRLineCollection = layoutResult.Pages(0).Lines
        'Get each OCRed line and its bounds.
        For Each line As Line In lines
            Dim text As String = line.Text
            Dim bounds As RectangleF = line.Rectangle
        Next
        'Create file stream.
        Using outputFileStream As FileStream = New FileStream("Output.pdf", FileMode.Create, FileAccess.ReadWrite)
            'Save the PDF document to file stream.
            document.Save(outputFileStream)
        End Using
        'Close the document.
        document.Close(True)
    End Using
End Using
You can download a complete working sample from GitHub.
Native call
Enabling native calls will not launch any temporary process for OCR processing; instead, it will invoke the native calls.
NOTE
The starting supported version of tesseract in ASP.NET Core is 4.0. So, the lower tesseract versions 3.02 and 3.05 are not supported, and the TesseractVersion and EnableNativeCall properties are not available on the ASP.NET Core platform.
Tesseract 3.02
Tesseract 3.02 supports only 32-bit version. By default, OCR works with this tesseract version 3.02.
NOTE
Enabling native calls will not work in 64-bit tesseract 3.02 version. Instead, a temporary process will be launched for OCR processing.
The following code sample demonstrates the OCR processor with native call support of tesseract 3.02 by setting TesseractVersion as 3.02.
//Initialize the OCR processor.
using (OCRProcessor processor = new OCRProcessor())
//Load an existing PDF document. The using statement releases the input file even if OCR fails.
using (FileStream stream = new FileStream("Input.pdf", FileMode.Open))
{
    PdfLoadedDocument document = new PdfLoadedDocument(stream);
    //Set OCR language.
    processor.Settings.Language = Languages.English;
    //Select tesseract version 3.02 as the OCR engine.
    //NOTE(review): this section is about native calls, but EnableNativeCall is not set in this sample. Confirm whether that is intended.
    processor.Settings.TesseractVersion = TesseractVersion.Version3_02;
    //Perform OCR on the input document.
    processor.PerformOCR(document);
    //Create file stream.
    using (FileStream outputFileStream = new FileStream("Output.pdf", FileMode.Create, FileAccess.ReadWrite))
    {
        //Save the PDF document to file stream.
        document.Save(outputFileStream);
    }
    //Close the document.
    document.Close(true);
}
'Initialize the OCR processor.
Using processor As OCRProcessor = New OCRProcessor()
    'Load an existing PDF document. The Using block releases the input file even if OCR fails.
    Using stream As FileStream = New FileStream("Input.pdf", FileMode.Open)
        Dim document As PdfLoadedDocument = New PdfLoadedDocument(stream)
        'Set OCR language.
        processor.Settings.Language = Languages.English
        'Select tesseract version 3.02 as the OCR engine.
        'NOTE(review): this section is about native calls, but EnableNativeCall is not set in this sample. Confirm whether that is intended.
        processor.Settings.TesseractVersion = TesseractVersion.Version3_02
        'Perform OCR on the input document.
        processor.PerformOCR(document)
        'Create file stream.
        Using outputFileStream As FileStream = New FileStream("Output.pdf", FileMode.Create, FileAccess.ReadWrite)
            'Save the PDF document to file stream.
            document.Save(outputFileStream)
        End Using
        'Close the document.
        document.Close(True)
    End Using
End Using
Tesseract 3.05
Tesseract 3.05 supports the native call for both x86 and x64 architectures. By default, the x86 tesseract binaries are available with Syncfusion NuGet package or the tesseract installer.
You can download the x64 supporting tesseract binaries from the following link.
Tesseract 64-bit binaries
NOTE
These 64-bit binaries are required only when the native call property is enabled.
Make sure to provide the 64-bit binaries path while using the 64-bit environment.
The following code sample demonstrates the OCR processor with native call support of tesseract 3.05 by setting TesseractVersion as 3.05.
//Initialize the OCR processor.
using (OCRProcessor processor = new OCRProcessor())
//Load an existing PDF document. The using statement releases the input file even if OCR fails.
using (FileStream stream = new FileStream("Input.pdf", FileMode.Open))
{
    PdfLoadedDocument document = new PdfLoadedDocument(stream);
    //Set OCR language.
    processor.Settings.Language = Languages.English;
    //Select tesseract version 3.05 as the OCR engine.
    processor.Settings.TesseractVersion = TesseractVersion.Version3_05;
    //Enable native calls.
    processor.Settings.EnableNativeCall = true;
    //Perform OCR on the input document.
    processor.PerformOCR(document);
    //Create file stream.
    using (FileStream outputFileStream = new FileStream("Output.pdf", FileMode.Create, FileAccess.ReadWrite))
    {
        //Save the PDF document to file stream.
        document.Save(outputFileStream);
    }
    //Close the document.
    document.Close(true);
}
'Initialize the OCR processor.
Using processor As OCRProcessor = New OCRProcessor()
    'Load an existing PDF document. The Using block releases the input file even if OCR fails.
    Using stream As FileStream = New FileStream("Input.pdf", FileMode.Open)
        Dim document As PdfLoadedDocument = New PdfLoadedDocument(stream)
        'Set OCR language.
        processor.Settings.Language = Languages.English
        'Select tesseract version 3.05 as the OCR engine.
        processor.Settings.TesseractVersion = TesseractVersion.Version3_05
        'Enable native calls.
        processor.Settings.EnableNativeCall = True
        'Perform OCR on the input document.
        processor.PerformOCR(document)
        'Create file stream.
        Using outputFileStream As FileStream = New FileStream("Output.pdf", FileMode.Create, FileAccess.ReadWrite)
            'Save the PDF document to file stream.
            document.Save(outputFileStream)
        End Using
        'Close the document.
        document.Close(True)
    End Using
End Using
Advantages of Native Call over Normal API
Enabling this property will process OCR with native calls (PInvoke) instead of surrogate process.
The surrogate process requires permission to create and execute a process, whereas native calls (PInvoke) do not. Performance is also better with PInvoke than with the surrogate process.
Customizing temp folder
While performing OCR on an existing scanned PDF document, the OCR Processor will create temporary files (.temp, .tiff, .txt) and the files are deleted after the process is completed. You can change this temporary files folder location using the TempFolder property available in the OCRSettings Instance. Refer to the following code sample to change the path of temp folder when performing the OCR.
//Initialize the OCR processor.
using (OCRProcessor processor = new OCRProcessor())
//Load an existing PDF document. The using statement releases the input file even if OCR fails.
using (FileStream stream = new FileStream("Input.pdf", FileMode.Open))
{
    PdfLoadedDocument document = new PdfLoadedDocument(stream);
    //Set OCR language.
    processor.Settings.Language = Languages.English;
    //Set custom temp file path location.
    processor.Settings.TempFolder = "D:/Temp/";
    //Perform OCR on the input document.
    processor.PerformOCR(document);
    //Create file stream.
    using (FileStream outputFileStream = new FileStream("Output.pdf", FileMode.Create, FileAccess.ReadWrite))
    {
        //Save the PDF document to file stream.
        document.Save(outputFileStream);
    }
    //Close the document.
    document.Close(true);
}
'Initialize the OCR processor.
Using processor As OCRProcessor = New OCRProcessor()
    'Load an existing PDF document. The Using block releases the input file even if OCR fails.
    Using stream As FileStream = New FileStream("Input.pdf", FileMode.Open)
        Dim document As PdfLoadedDocument = New PdfLoadedDocument(stream)
        'Set OCR language.
        processor.Settings.Language = Languages.English
        'Set custom temp file path location.
        processor.Settings.TempFolder = "D:/Temp/"
        'Perform OCR on the input document.
        processor.PerformOCR(document)
        'Create file stream.
        Using outputFileStream As FileStream = New FileStream("Output.pdf", FileMode.Create, FileAccess.ReadWrite)
            'Save the PDF document to file stream.
            document.Save(outputFileStream)
        End Using
        'Close the document.
        document.Close(True)
    End Using
End Using
You can download a complete working sample from GitHub.
Performing OCR with different Page Segmentation Mode
The PageSegment property is used to set the page segmentation mode. By default, OCR works with the “Auto” page segmentation mode. Kindly refer to the following code example to perform OCR with different page segmentation mode.
NOTE
The page segmentation mode is supported only in the Tesseract version 4.0 and above.
//Initialize the OCR processor.
using (OCRProcessor processor = new OCRProcessor())
//Load an existing PDF document. The using statement releases the input file even if OCR fails.
using (FileStream stream = new FileStream("Input.pdf", FileMode.Open))
{
    PdfLoadedDocument document = new PdfLoadedDocument(stream);
    //Set OCR language.
    processor.Settings.Language = Languages.English;
    //** For .NET Framework only **.
    //processor.Settings.TesseractVersion = TesseractVersion.Version4_0;
    //Set OCR Page segment mode to process.
    processor.Settings.PageSegment = PageSegMode.AutoOsd;
    //Perform OCR on the input document.
    processor.PerformOCR(document);
    //Create file stream.
    using (FileStream outputFileStream = new FileStream("Output.pdf", FileMode.Create, FileAccess.ReadWrite))
    {
        //Save the PDF document to file stream.
        document.Save(outputFileStream);
    }
    //Close the document.
    document.Close(true);
}
'Initialize the OCR processor.
Using processor As OCRProcessor = New OCRProcessor()
    'Load an existing PDF document. The Using block releases the input file even if OCR fails.
    Using stream As FileStream = New FileStream("Input.pdf", FileMode.Open)
        Dim document As PdfLoadedDocument = New PdfLoadedDocument(stream)
        'Set OCR language.
        processor.Settings.Language = Languages.English
        ' ** For .NET Framework only **.
        'processor.Settings.TesseractVersion = TesseractVersion.Version4_0
        'Set OCR Page segment mode to process.
        processor.Settings.PageSegment = PageSegMode.AutoOsd
        'Perform OCR on the input document.
        processor.PerformOCR(document)
        'Create file stream.
        Using outputFileStream As FileStream = New FileStream("Output.pdf", FileMode.Create, FileAccess.ReadWrite)
            'Save the PDF document to file stream.
            document.Save(outputFileStream)
        End Using
        'Close the document.
        document.Close(True)
    End Using
End Using
Performing OCR with different OCR Engine Mode
The OCREngineMode property is used to set the OCR Engine modes. By default, OCR works with OCR Engine mode “Default”. Refer to the following code example to perform OCR with different OCR engine mode.
//Initialize the OCR processor.
using (OCRProcessor processor = new OCRProcessor())
//Load an existing PDF document. The using statement releases the input file even if OCR fails.
using (FileStream stream = new FileStream("Input.pdf", FileMode.Open))
{
    PdfLoadedDocument document = new PdfLoadedDocument(stream);
    //Set OCR language.
    processor.Settings.Language = Languages.English;
    //Set tesseract version. ** For .NET Framework only. **
    //processor.Settings.TesseractVersion = TesseractVersion.Version4_0;
    //Set OCR engine mode to process.
    processor.Settings.OCREngineMode = OCREngineMode.LSTMOnly;
    //Perform OCR on the input document.
    processor.PerformOCR(document);
    //Create file stream.
    using (FileStream outputFileStream = new FileStream("Output.pdf", FileMode.Create, FileAccess.ReadWrite))
    {
        //Save the PDF document to file stream.
        document.Save(outputFileStream);
    }
    //Close the document.
    document.Close(true);
}
'Initialize the OCR processor.
Using processor As OCRProcessor = New OCRProcessor()
    'Load an existing PDF document. The Using block releases the input file even if OCR fails.
    Using stream As FileStream = New FileStream("Input.pdf", FileMode.Open)
        Dim document As PdfLoadedDocument = New PdfLoadedDocument(stream)
        'Set OCR language.
        processor.Settings.Language = Languages.English
        'Set tesseract version. ** For .NET Framework only. **
        'processor.Settings.TesseractVersion = TesseractVersion.Version4_0
        'Set OCR engine mode to process.
        processor.Settings.OCREngineMode = OCREngineMode.LSTMOnly
        'Perform OCR on the input document.
        processor.PerformOCR(document)
        'Create file stream.
        Using outputFileStream As FileStream = New FileStream("Output.pdf", FileMode.Create, FileAccess.ReadWrite)
            'Save the PDF document to file stream.
            document.Save(outputFileStream)
        End Using
        'Close the document.
        document.Close(True)
    End Using
End Using
NOTE
The OCR Engine Mode is supported only in the Tesseract version 4.0 and above.
White List
The WhiteList property specifies the only characters that the OCR engine is allowed to recognize. If a character is not on the white list, it will not be included in the output OCR results. For more information, refer to the following code sample.
//Initialize the OCR processor.
using (OCRProcessor processor = new OCRProcessor())
//Load an existing PDF document. The using statement releases the input file even if OCR fails.
using (FileStream stream = new FileStream("Input.pdf", FileMode.Open))
{
    PdfLoadedDocument document = new PdfLoadedDocument(stream);
    //Set OCR language.
    processor.Settings.Language = Languages.English;
    //Set tesseract version. ** For .NET Framework only. **
    //processor.Settings.TesseractVersion = TesseractVersion.Version4_0;
    //Set OCR engine mode to process.
    processor.Settings.OCREngineMode = OCREngineMode.LSTMOnly;
    //Set WhiteList Property.
    processor.Settings.WhiteList = "PDF";
    //Perform OCR on the input document.
    processor.PerformOCR(document);
    //Create file stream.
    using (FileStream outputFileStream = new FileStream("Output.pdf", FileMode.Create, FileAccess.ReadWrite))
    {
        //Save the PDF document to file stream.
        document.Save(outputFileStream);
    }
    //Close the document.
    document.Close(true);
}
'Initialize the OCR processor.
Using processor As OCRProcessor = New OCRProcessor()
    'Load an existing PDF document. The Using block releases the input file even if OCR fails.
    Using stream As FileStream = New FileStream("Input.pdf", FileMode.Open)
        Dim document As PdfLoadedDocument = New PdfLoadedDocument(stream)
        'Set OCR language.
        processor.Settings.Language = Languages.English
        'Set tesseract version. ** For .NET Framework only. **
        'processor.Settings.TesseractVersion = TesseractVersion.Version4_0
        'Set OCR engine mode to process.
        processor.Settings.OCREngineMode = OCREngineMode.LSTMOnly
        'Set WhiteList Property.
        processor.Settings.WhiteList = "PDF"
        'Perform OCR on the input document.
        processor.PerformOCR(document)
        'Create file stream.
        Using outputFileStream As FileStream = New FileStream("Output.pdf", FileMode.Create, FileAccess.ReadWrite)
            'Save the PDF document to file stream.
            document.Save(outputFileStream)
        End Using
        'Close the document.
        document.Close(True)
    End Using
End Using
Black List
The BlackList property specifies the characters to exclude from the character set used for recognition; the OCR will not return any of the characters specified in the list. For more information, refer to the following code sample.
//Initialize the OCR processor.
using (OCRProcessor processor = new OCRProcessor())
//Load an existing PDF document. The using statement releases the input file even if OCR fails.
using (FileStream stream = new FileStream("Input.pdf", FileMode.Open))
{
    PdfLoadedDocument document = new PdfLoadedDocument(stream);
    //Set OCR language.
    processor.Settings.Language = Languages.English;
    //Set tesseract version. ** For .NET Framework only. **
    //processor.Settings.TesseractVersion = TesseractVersion.Version4_0;
    //Set OCR engine mode to process.
    processor.Settings.OCREngineMode = OCREngineMode.LSTMOnly;
    //Set BlackList Property.
    processor.Settings.BlackList = "PDF";
    //Perform OCR on the input document.
    processor.PerformOCR(document);
    //Create file stream.
    using (FileStream outputFileStream = new FileStream("Output.pdf", FileMode.Create, FileAccess.ReadWrite))
    {
        //Save the PDF document to file stream.
        document.Save(outputFileStream);
    }
    //Close the document.
    document.Close(true);
}
'Initialize the OCR processor.
Using processor As OCRProcessor = New OCRProcessor()
    'Load an existing PDF document. The Using block releases the input file even if OCR fails.
    Using stream As FileStream = New FileStream("Input.pdf", FileMode.Open)
        Dim document As PdfLoadedDocument = New PdfLoadedDocument(stream)
        'Set OCR language.
        processor.Settings.Language = Languages.English
        'Set tesseract version. ** For .NET Framework only. **
        'processor.Settings.TesseractVersion = TesseractVersion.Version4_0
        'Set OCR engine mode to process.
        processor.Settings.OCREngineMode = OCREngineMode.LSTMOnly
        'Set BlackList Property.
        processor.Settings.BlackList = "PDF"
        'Perform OCR on the input document.
        processor.PerformOCR(document)
        'Create file stream.
        Using outputFileStream As FileStream = New FileStream("Output.pdf", FileMode.Create, FileAccess.ReadWrite)
            'Save the PDF document to file stream.
            document.Save(outputFileStream)
        End Using
        'Close the document.
        document.Close(True)
    End Using
End Using
OCR an Image to PDF
You can perform OCR on an image and convert it to a searchable PDF document. It is also possible to specify the conformance level through PdfConformanceLevel Enum to the output PDF document using OCR processor settings.
NOTE
This PDF conformance option only applies for image OCR to PDF documents.
The following code sample illustrates how to OCR an image to a PDF document:
//Initialize the OCR processor.
using (OCRProcessor processor = new OCRProcessor())
//Open the input image and the Unicode font file; both streams are released when the block exits.
using (FileStream imageStream = new FileStream(@"Input.jpg", FileMode.Open))
using (FileStream fontStream = new FileStream(@"ARIALUNI.ttf", FileMode.Open))
{
    //Set OCR language to process.
    processor.Settings.Language = Languages.English;
    //Set the Unicode font to preserve the Unicode characters in a PDF document.
    processor.UnicodeFont = new PdfTrueTypeFont(fontStream, true, PdfFontStyle.Regular, 10);
    //Set the PDF conformance level.
    processor.Settings.Conformance = PdfConformanceLevel.Pdf_A1B;
    //Process OCR by providing the image stream; the result is a new PDF document.
    PdfDocument document = processor.PerformOCR(imageStream);
    //Create file stream.
    using (FileStream outputFileStream = new FileStream(@"Output.pdf", FileMode.Create, FileAccess.ReadWrite))
    {
        //Save the PDF document to file stream.
        document.Save(outputFileStream);
    }
    //Close the document before the image and font streams are released.
    document.Close(true);
}
'Initialize the OCR processor.
Using processor As New OCRProcessor()
    'Open the input image and the Unicode font file; both streams are released at End Using.
    Using imageStream As New FileStream("Input.jpg", FileMode.Open),
          fontStream As New FileStream("ARIALUNI.ttf", FileMode.Open)
        'Set OCR language to process.
        processor.Settings.Language = Languages.English
        'Set the Unicode font to preserve the Unicode characters in a PDF document.
        processor.UnicodeFont = New PdfTrueTypeFont(fontStream, True, PdfFontStyle.Regular, 10)
        'Set the PDF conformance level.
        processor.Settings.Conformance = PdfConformanceLevel.Pdf_A1B
        'Process OCR by providing the image stream; the result is a new PDF document.
        Dim document As PdfDocument = processor.PerformOCR(imageStream)
        'Create file stream.
        Using outputFileStream As New FileStream("Output.pdf", FileMode.Create, FileAccess.ReadWrite)
            'Save the PDF document to file stream.
            document.Save(outputFileStream)
        End Using
        'Close the document before the image and font streams are released.
        document.Close(True)
    End Using
End Using
You can download a complete working sample from GitHub.
Performing OCR with Unicode characters
You can perform OCR on images with Unicode characters. To preserve the Unicode characters in the PDF document, use the UnicodeFont property. For more information, refer to the following code sample.
//Initialize the OCR processor.
using (OCRProcessor processor = new OCRProcessor())
//Open the input PDF and the Unicode font file; both streams are released when the block exits.
using (FileStream stream = new FileStream("Input.pdf", FileMode.Open))
using (FileStream fontStream = new FileStream(@"ARIALUNI.ttf", FileMode.Open))
{
    //Load the existing PDF document.
    PdfLoadedDocument document = new PdfLoadedDocument(stream);
    //Set OCR language.
    processor.Settings.Language = Languages.English;
    //Set the Unicode font to preserve the Unicode characters in a PDF document.
    processor.UnicodeFont = new PdfTrueTypeFont(fontStream, 8);
    //Perform OCR with input document and tessdata (Language packs).
    processor.PerformOCR(document);
    //Create file stream.
    using (FileStream outputFileStream = new FileStream("Output.pdf", FileMode.Create, FileAccess.ReadWrite))
    {
        //Save the PDF document to file stream.
        document.Save(outputFileStream);
    }
    //Close the document before the input and font streams are released.
    document.Close(true);
}
//By default unicode characters can be extracted from image file in .NET Framework applications like WF, WPF, ASP.NET and ASP.NET MVC.
//By default unicode characters can be extracted from image file in .NET Framework applications like WF, WPF, ASP.NET and ASP.NET MVC.
You can download a complete working sample from GitHub.
Best Practices
You can improve the accuracy of the OCR process by choosing the correct compression method when converting the scanned paper to a TIFF image and then to a PDF document.
- Use (zip) lossless compression for color or gray-scale images.
- Use CCITT Group 4 or JBIG2 (lossless) compression for monochrome images. This ensures that optical character recognition works on the highest-quality image, thereby improving the OCR accuracy. This is especially useful in low-resolution scans.
- In addition, rotated images and skewed images can also affect the accuracy and readability of the OCR process.
Tesseract works best with text when at least 300 dots per inch (DPI) are used, so it is beneficial to resize images.
For more details regarding quality improvement, refer to the following link.
https://github.com/tesseract-ocr/tesseract/wiki/ImproveQuality
You can set different performance levels for the OCRProcessor using the Performance enumeration.
- Rapid - provides high-speed OCR processing with normal OCR accuracy.
- Fast - provides moderate OCR processing speed and accuracy.
- Slow - provides slow OCR processing with the best OCR accuracy.
Refer to the following code sample to set the performance of the OCR.
//Initialize the OCR processor; the using block disposes it, as in the other samples on this page.
using (OCRProcessor processor = new OCRProcessor())
{
    //Set the OCR performance.
    processor.Settings.Performance = Performance.Fast;
    //Load the input and call PerformOCR here, inside the using block.
}
'Initialize the OCR processor; the Using block disposes it, as in the other samples on this page.
Using processor As New OCRProcessor()
    'Set the OCR performance.
    processor.Settings.Performance = Performance.Fast
    'Load the input and call PerformOCR here, inside the Using block.
End Using
TesseractBinaries Paths and Tesseract Language Data
Starting with v21.1.x, the TesseractBinaries and Tesseract language data folder paths are added by default, so there is no need to provide these paths explicitly. However, you can still set the TesseractBinaries and Tessdata paths manually in your application if required.
NOTE
You can get the TesseractBinaries or TessData files from the runtimes folder of the NuGet package or from the bin folder of the application.
//Initialize the OCR processor by providing the path of the tesseract binaries (SyncfusionTesseract.dll and liblept168.dll)
using (OCRProcessor processor = new OCRProcessor(@"TesseractBinaries\"))
//Open the input PDF; the stream is released when the block exits.
using (FileStream stream = new FileStream("Input.pdf", FileMode.Open))
{
    //Load the existing PDF document.
    PdfLoadedDocument document = new PdfLoadedDocument(stream);
    //Set OCR language to process.
    processor.Settings.Language = Languages.English;
    //Perform OCR with input document and the path of the tessdata folder (Language packs).
    processor.PerformOCR(document, @"TessData\");
    //Create file stream.
    using (FileStream outputFileStream = new FileStream("Output.pdf", FileMode.Create, FileAccess.ReadWrite))
    {
        //Save the PDF document to file stream.
        document.Save(outputFileStream);
    }
    //Close the document before the input stream is released.
    document.Close(true);
}
'Initialize the OCR processor by providing the path of the tesseract binaries (SyncfusionTesseract.dll and liblept168.dll)
Using processor As New OCRProcessor("TesseractBinaries\")
    'Open the input PDF; the stream is released at End Using.
    Using inputStream As New FileStream("Input.pdf", FileMode.Open)
        'Load the existing PDF document.
        Dim document As New PdfLoadedDocument(inputStream)
        'Set OCR language to process.
        processor.Settings.Language = Languages.English
        'Perform OCR with input document and the path of the tessdata folder (Language packs).
        processor.PerformOCR(document, "TessData\")
        'Create file stream.
        Using outputFileStream As New FileStream("Output.pdf", FileMode.Create, FileAccess.ReadWrite)
            'Save the PDF document to file stream.
            document.Save(outputFileStream)
        End Using
        'Close the document before the input stream is released.
        document.Close(True)
    End Using
End Using
Get image rotation angle from OCR processor
The OCR processor can report the rotation angle of an image as one of four angles (0, 90, 180, and 270 degrees). This feature works with multiple images and multiple pages. The following code sample illustrates how to get the image rotation angle from the OCR processor.
//Initialize the OCR processor.
using (OCRProcessor processor = new OCRProcessor())
//Open the input PDF file; the stream is released when the block exits.
using (FileStream stream = new FileStream(@"D:\Input.pdf", FileMode.Open))
{
    //Load the existing PDF document.
    PdfLoadedDocument document = new PdfLoadedDocument(stream);
    //Set the OCR language.
    processor.Settings.Language = Languages.English;
    //Set the path of the Tesseract binaries.
    processor.TesseractPath = @"D:\Tesseractbinaries_core\Windows\x64";
    //Perform OCR for page index 0 through 0 and get the layout result.
    processor.PerformOCR(document, 0, 0, @"D:\tessdata", out OCRLayoutResult result);
    if (result != null)
    {
        foreach (var page in result.Pages)
        {
            //Read the image rotation angle reported for the OCRed page.
            float angle = page.ImageRotation;
            if (angle == 180)
            {
                //Rotate the first PDF page by 180 degrees when the reported angle is 180.
                document.Pages[0].Rotation = PdfPageRotateAngle.RotateAngle180;
            }
        }
    }
    //Create file stream.
    using (FileStream outputFileStream = new FileStream("Output.pdf", FileMode.Create, FileAccess.ReadWrite))
    {
        //Save the PDF document to file stream.
        document.Save(outputFileStream);
    }
    //Close the document before the input stream is released.
    document.Close(true);
}
'Initialize the OCR processor.
Using processor As New OCRProcessor()
    'Open the input PDF file; the stream is released at End Using.
    Using inputStream As New FileStream("D:\Input.pdf", FileMode.Open)
        'Load the existing PDF document.
        Dim document As New PdfLoadedDocument(inputStream)
        'Set the OCR language.
        processor.Settings.Language = Languages.English
        'Set the path of the Tesseract binaries.
        processor.TesseractPath = "D:\Tesseractbinaries_core\Windows\x64"
        'Perform OCR for page index 0 through 0; the layout result is returned through the ByRef argument.
        Dim result As OCRLayoutResult = Nothing
        processor.PerformOCR(document, 0, 0, "D:\tessdata", result)
        If result IsNot Nothing Then
            For Each page In result.Pages
                'Read the image rotation angle reported for the OCRed page.
                Dim angle As Single = page.ImageRotation
                If angle = 180 Then
                    'Rotate the first PDF page by 180 degrees when the reported angle is 180.
                    document.Pages(0).Rotation = PdfPageRotateAngle.RotateAngle180
                End If
            Next
        End If
        'Create file stream.
        Using outputFileStream As New FileStream("Output.pdf", FileMode.Create, FileAccess.ReadWrite)
            'Save the PDF document to file stream.
            document.Save(outputFileStream)
        End Using
        'Close the document before the input stream is released.
        document.Close(True)
    End Using
End Using
You can download a complete working sample from GitHub.
Image Enhancement in OCR Processor library
We have support to improve the image quality while performing OCR for an image or PDF document. In this process, we can enhance the image quality by using binarization, grayscale, and resolution enhancement methods with third-party libraries. Please refer to the code snippet below.
//Initialize the OCR processor.
using (OCRProcessor processor = new OCRProcessor())
//Open the input PDF; the using block releases the stream even if OCR throws.
using (FileStream stream = new FileStream("../../../document.pdf", FileMode.Open))
{
    //Load the existing PDF document.
    PdfLoadedDocument lDoc = new PdfLoadedDocument(stream);
    //Set the OCR language.
    processor.Settings.Language = Languages.English;
    //Assign the sample's image-enhancement class (ImageProcessor is defined by the sample, not shown here).
    processor.ImageProcessor = new ImageProcessor();
    //Perform OCR with input document; the returned string is the OCRed text.
    string text = processor.PerformOCR(lDoc);
    //Create file stream. FileMode.Create overwrites an existing file, so the sample can be re-run
    //(FileMode.CreateNew throws an IOException when the file already exists).
    using (FileStream fileStream = new FileStream("../../../OCR.pdf", FileMode.Create))
    {
        //Save the document into stream.
        lDoc.Save(fileStream);
    }
    //Close the document before the input stream is released.
    lDoc.Close(true);
}
' Initialize the OCR processor.
Using processor As New OCRProcessor()
    ' Open the input PDF; the Using block releases the stream even if OCR throws.
    Using inputStream As New FileStream("../../../document.pdf", FileMode.Open)
        ' Load the existing PDF document.
        Dim lDoc As New PdfLoadedDocument(inputStream)
        ' Set the OCR language.
        processor.Settings.Language = Languages.English
        ' Assign the sample's image-enhancement class (ImageProcessor is defined by the sample, not shown here).
        processor.ImageProcessor = New ImageProcessor()
        ' Perform OCR with input document; the returned string is the OCRed text.
        Dim text As String = processor.PerformOCR(lDoc)
        ' Create file stream. FileMode.Create overwrites an existing file, so the sample can be re-run
        ' (FileMode.CreateNew throws an IOException when the file already exists).
        Using outputStream As New FileStream("../../../OCR.pdf", FileMode.Create)
            ' Save the document into stream.
            lDoc.Save(outputStream)
        End Using
        ' Close the document before the input stream is released.
        lDoc.Close(True)
    End Using
End Using
NOTE
Note: In this sample, we are using the SixLabors.ImageSharp library to improve the image quality. You can use any image processing library as per your requirement.
You can download a complete working sample from GitHub.
OCR with multiple languages
The Syncfusion OCR processor supports multiple languages in C#. You can configure the OCR processor to recognize text in multiple languages by specifying the required language files.
Here’s a general outline of how to enable multiple languages in Syncfusion OCR processor:
Install Required Dependencies:
Ensure you have installed the necessary NuGet packages, including Syncfusion.OCRProcessor and Tesseract, for OCR functionalities.
Set Up OCR Processor:
You need to download the language data files (.traineddata) for the languages you want to use. These files are required by the OCR engine to recognize different languages.
Load the Language Files:
You can set up multiple languages by specifying the language codes (e.g., “eng” for English, “fra” for French) and ensuring that the trained data for those languages is available.
Here is a basic example of using Syncfusion OCR processor with multiple languages in C#:
// Initialize the OCR processor within a using block to ensure resources are properly disposed
using (OCRProcessor ocrProcessor = new OCRProcessor())
// Open the TrueType font file and the PDF file to be processed; both streams are released when the block exits
using (FileStream fontStream = new FileStream("arialuni.ttf", FileMode.Open))
using (FileStream fileStream = new FileStream("Input.pdf", FileMode.Open))
{
    // Set the Unicode font for the OCR processor using the TrueType font stream and a font size of 12
    ocrProcessor.UnicodeFont = new Syncfusion.Pdf.Graphics.PdfTrueTypeFont(fontStream, 12);
    // Load the PDF document from the file stream
    PdfLoadedDocument loadedDocument = new PdfLoadedDocument(fileStream);
    // Configure OCR settings
    OCRSettings ocrSettings = new OCRSettings();
    // Specify the languages to be used for OCR
    ocrSettings.Language = "eng+deu+ara+ell+fra"; // English, German, Arabic, Greek, French
    // Apply the OCR settings to the OCR processor
    ocrProcessor.Settings = ocrSettings;
    // Perform OCR on the loaded PDF document, providing the path to the tessdata directory
    ocrProcessor.PerformOCR(loadedDocument, "tessdata");
    // Create a file stream to save the OCR-processed PDF
    using (FileStream outputFileStream = new FileStream("OCR_Output.pdf", FileMode.Create))
    {
        // Save the OCR-processed document to the file stream
        loadedDocument.Save(outputFileStream);
    }
    // Close the loaded document before the font and input streams are released
    loadedDocument.Close(true);
}
// Initialize the OCR processor within a using block to ensure resources are properly disposed
using (OCRProcessor ocrProcessor = new OCRProcessor())
// Open the TrueType font file and the PDF file to be processed; both streams are released when the block exits
using (FileStream fontStream = new FileStream("arialuni.ttf", FileMode.Open))
using (FileStream fileStream = new FileStream("Input.pdf", FileMode.Open))
{
    // Set the Unicode font for the OCR processor using the TrueType font stream and a font size of 12
    ocrProcessor.UnicodeFont = new Syncfusion.Pdf.Graphics.PdfTrueTypeFont(fontStream, 12);
    // Load the PDF document from the file stream
    PdfLoadedDocument loadedDocument = new PdfLoadedDocument(fileStream);
    // Configure OCR settings
    OCRSettings ocrSettings = new OCRSettings();
    // Specify the languages to be used for OCR
    ocrSettings.Language = "eng+deu+ara+ell+fra"; // English, German, Arabic, Greek, French
    // Apply the OCR settings to the OCR processor
    ocrProcessor.Settings = ocrSettings;
    // Perform OCR on the loaded PDF document, providing the path to the tessdata directory
    ocrProcessor.PerformOCR(loadedDocument, "tessdata");
    // Save the OCR-processed document to the specified file
    loadedDocument.Save("OCR_Output.pdf");
    // Close the loaded document before the font and input streams are released
    loadedDocument.Close(true);
}
' Initialize the OCR processor within a Using block to ensure resources are properly disposed
Using ocrProcessor As New OCRProcessor()
    ' Open the TrueType font file and the PDF file to be processed; both streams are released at End Using
    Using fontStream As New FileStream("arialuni.ttf", FileMode.Open),
          fileStream As New FileStream("Input.pdf", FileMode.Open)
        ' Set the Unicode font for the OCR processor using the TrueType font stream and a font size of 12
        ocrProcessor.UnicodeFont = New Syncfusion.Pdf.Graphics.PdfTrueTypeFont(fontStream, 12)
        ' Load the PDF document from the file stream
        Dim loadedDocument As New PdfLoadedDocument(fileStream)
        ' Configure OCR settings
        Dim ocrSettings As New OCRSettings()
        ' Specify the languages to be used for OCR
        ocrSettings.Language = "eng+deu+ara+ell+fra" ' English, German, Arabic, Greek, French
        ' Apply the OCR settings to the OCR processor
        ocrProcessor.Settings = ocrSettings
        ' Perform OCR on the loaded PDF document, providing the path to the tessdata directory
        ocrProcessor.PerformOCR(loadedDocument, "tessdata")
        ' Save the OCR-processed document to the specified file
        Using fileSaveStream As New FileStream("OCR_Output.pdf", FileMode.Create)
            loadedDocument.Save(fileSaveStream)
        End Using
        ' Close the loaded document before the font and input streams are released
        loadedDocument.Close(True)
    End Using
End Using
You can find the .traineddata files for different languages on the Tesseract GitHub page.
You can download a complete working sample from GitHub.