Text Extraction in JavaScript PDF library

18 Dec 2025 · 17 minutes to read

The JavaScript PDF library allows you to extract text from a particular page, a range of pages, or the entire PDF document.

NOTE

For text extraction features, you need to install the @syncfusion/ej2-pdf-data-extract package as an add-on.

Working with basic text extraction

This example demonstrates how to extract text from a PDF document using the extractText method of the PdfDataExtractor class. Basic text extraction retrieves the plain text content of the document as a string.

TypeScript
JavaScript
import { PdfDocument } from '@syncfusion/ej2-pdf';
import { PdfDataExtractor } from '@syncfusion/ej2-pdf-data-extract';

// Open the source PDF from `data`
const document = new PdfDocument(data);
// Create the extractor that reads content from the loaded document
const extractor = new PdfDataExtractor(document);
// Pull the plain text out of the document
const extractedText: string = extractor.extractText();
// Write the document out as 'Output.pdf'
document.save('Output.pdf');
// Destroy the document once it is no longer needed
document.destroy();
// Open the source PDF from `data`
var document = new ej.pdf.PdfDocument(data);
// Create the extractor that reads content from the loaded document
var dataExtractor = new ej.pdfdataextract.PdfDataExtractor(document);
// Pull the plain text out of the document
var extractedText = dataExtractor.extractText();
// Write the document out as 'Output.pdf'
document.save('Output.pdf');
// Destroy the document once it is no longer needed
document.destroy();

Extract text from specific page range in a PDF document

This example demonstrates how to extract text from a PDF document by specifying a start and end page index. Page indices are zero-based, so the sample below, which runs from index 0 to pageCount - 1, covers every page; narrow the two values to retrieve text from a smaller range of pages for processing or analysis.

TypeScript
JavaScript
import { PdfDocument } from '@syncfusion/ej2-pdf';
import { PdfDataExtractor } from '@syncfusion/ej2-pdf-data-extract';

// Open the source PDF from `data`
const document = new PdfDocument(data);
// Create the extractor that reads content from the loaded document
const extractor = new PdfDataExtractor(document);
// Index of the last page; the range below starts at index 0
const lastPageIndex: number = document.pageCount - 1;
// Pull the text of the pages between the start and end index
const extractedText: string = extractor.extractText({ startPageIndex: 0, endPageIndex: lastPageIndex });
// Write the document out as 'Output.pdf'
document.save('Output.pdf');
// Destroy the document once it is no longer needed
document.destroy();
// Open the source PDF from `data`
var document = new ej.pdf.PdfDocument(data);
// Create the extractor that reads content from the loaded document
var dataExtractor = new ej.pdfdataextract.PdfDataExtractor(document);
// Index of the last page; the range below starts at index 0
var lastPageIndex = document.pageCount - 1;
// Pull the text of the pages between the start and end index
var extractedText = dataExtractor.extractText({ startPageIndex: 0, endPageIndex: lastPageIndex });
// Write the document out as 'Output.pdf'
document.save('Output.pdf');
// Destroy the document once it is no longer needed
document.destroy();

Working with layout based text extraction

This example demonstrates how to extract text from a PDF document based on its layout by passing the isLayout option, set to true, to the extractText method of the PdfDataExtractor class.

TypeScript
JavaScript
import { PdfDocument } from '@syncfusion/ej2-pdf';
import { PdfDataExtractor } from '@syncfusion/ej2-pdf-data-extract';

// Open the source PDF from `data`
const document = new PdfDocument(data);
// Create the extractor that reads content from the loaded document
const extractor = new PdfDataExtractor(document);
// Request layout-based extraction
const layoutOptions = { isLayout: true };
// Pull the text out of the document using the layout-based mode
const extractedText: string = extractor.extractText(layoutOptions);
// Write the document out as 'Output.pdf'
document.save('Output.pdf');
// Destroy the document once it is no longer needed
document.destroy();
// Open the source PDF from `data`
var document = new ej.pdf.PdfDocument(data);
// Create the extractor that reads content from the loaded document
var dataExtractor = new ej.pdfdataextract.PdfDataExtractor(document);
// Request layout-based extraction
var layoutOptions = { isLayout: true };
// Pull the text out of the document using the layout-based mode
var extractedText = dataExtractor.extractText(layoutOptions);
// Write the document out as 'Output.pdf'
document.save('Output.pdf');
// Destroy the document once it is no longer needed
document.destroy();

NOTE

Layout based text extraction may take additional processing time when compared to the normal extraction mode.

Text extraction with bounds

Working with lines

This example demonstrates how to extract text from a PDF document line by line using the extractTextLines method. The method returns a collection of TextLine objects, each of which exposes the line's text, bounds, page index, words, and font details.

TypeScript
JavaScript
import { PdfDocument } from '@syncfusion/ej2-pdf';
import { PdfDataExtractor, TextLine } from '@syncfusion/ej2-pdf-data-extract';

// Load an existing PDF document
const document: PdfDocument = new PdfDocument(data);
// Initialize a new instance of the `PdfDataExtractor` class
const extractor: PdfDataExtractor = new PdfDataExtractor(document);
// Extract the `TextLine` collection from every page (index 0 through `pageCount - 1`)
const textLines: Array<TextLine> = extractor.extractTextLines({ startPageIndex: 0, endPageIndex: document.pageCount - 1 });
// Iterate through each text line in the collection
textLines.forEach((textLine: TextLine) => {
    // Only `PdfDocument`, `PdfDataExtractor` and `TextLine` are imported by this sample, so
    // values of other library types (`Rectangle`, `TextWord[]`, `PdfFontStyle`) are left to
    // type inference instead of being annotated with names that are not in scope.

    // Gets the bounds of the text line (a `Rectangle`).
    const lineBounds = textLine.bounds;
    // Gets the single line of extracted text from the PDF page.
    const line: string = textLine.text;
    // Gets the page index of the text line extracted.
    const pageIndex: number = textLine.pageIndex;
    // Gets the collection of text words (`TextWord[]`) that make up this line.
    const words = textLine.words;
    // Gets the name of the font used for a particular line of text.
    const fontName: string = textLine.fontName;
    // Gets the font style (`PdfFontStyle`) used for a particular line of text.
    const fontStyle = textLine.fontStyle;
    // Gets the font size used for a particular line of text.
    const fontSize: number = textLine.fontSize;
});
// Save the document
document.save('output.pdf');
// Destroy the document
document.destroy();
// Open the source PDF from `data`
var document = new ej.pdf.PdfDocument(data);
// Create the extractor that reads content from the loaded document
var dataExtractor = new ej.pdfdataextract.PdfDataExtractor(document);
// Index of the last page; the range below starts at index 0
var lastPageIndex = document.pageCount - 1;
// Collect the text lines of the pages between the start and end index
var textLines = dataExtractor.extractTextLines({ startPageIndex: 0, endPageIndex: lastPageIndex });
// Visit every extracted line and read its properties
textLines.forEach(function (currentLine) {
    // Bounds of the line
    var lineBounds = currentLine.bounds;
    // Text content of the line
    var line = currentLine.text;
    // Index of the page the line was extracted from
    var pageIndex = currentLine.pageIndex;
    // Words that make up the line
    var words = currentLine.words;
    // Name of the font used for the line
    var fontName = currentLine.fontName;
    // Style of the font used for the line
    var fontStyle = currentLine.fontStyle;
    // Size of the font used for the line
    var fontSize = currentLine.fontSize;
});
// Write the document out as 'output.pdf'
document.save('output.pdf');
// Destroy the document once it is no longer needed
document.destroy();

Working with words

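This example demonstrates how to extract words from a PDF document using the extractTextLines method. Each TextLine exposes its words as a collection of TextWord objects through the words property, and each word provides its text, bounds, glyphs, and font details.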

TypeScript
JavaScript
import { PdfDocument } from '@syncfusion/ej2-pdf';
import { PdfDataExtractor, TextLine, TextWord, PdfFontStyle } from '@syncfusion/ej2-pdf-data-extract';

// Load an existing PDF document
const document: PdfDocument = new PdfDocument(data);
// Initialize a new instance of the `PdfDataExtractor` class
const extractor: PdfDataExtractor = new PdfDataExtractor(document);
// Extract the `TextLine` collection from every page (index 0 through `pageCount - 1`)
const textLines: Array<TextLine> = extractor.extractTextLines({ startPageIndex: 0, endPageIndex: document.pageCount - 1 });
// Iterate through each text line, then through each word of that line
textLines.forEach((textLine: TextLine) => {
    textLine.words.forEach((textWord: TextWord) => {
        // `Rectangle` and `TextGlyph` are not imported by this sample, so the two values of
        // those types below are left to type inference instead of being annotated.

        // Gets the bounds of the text word (a `Rectangle`).
        const wordBounds = textWord.bounds;
        // Gets the single word of extracted text from the PDF page.
        const word: string = textWord.text;
        // Gets the collection of text glyphs (`TextGlyph[]`) that make up this word.
        const glyphs = textWord.glyphs;
        // Gets the name of the font used for a particular word.
        const wordFontName: string = textWord.fontName;
        // Gets the style of the font used for a particular word.
        const wordFontStyle: PdfFontStyle = textWord.fontStyle;
        // Gets the size of the font used for a particular word.
        const wordFontSize: number = textWord.fontSize;
    });
});
// Save the document
document.save('output.pdf');
// Destroy the document
document.destroy();
// Open the source PDF from `data`
var document = new ej.pdf.PdfDocument(data);
// Create the extractor that reads content from the loaded document
var dataExtractor = new ej.pdfdataextract.PdfDataExtractor(document);
// Index of the last page; the range below starts at index 0
var lastPageIndex = document.pageCount - 1;
// Collect the text lines of the pages between the start and end index
var textLines = dataExtractor.extractTextLines({ startPageIndex: 0, endPageIndex: lastPageIndex });
// Visit every line, then every word inside that line
textLines.forEach(function (currentLine) {
    currentLine.words.forEach(function (currentWord) {
        // Bounds of the word
        var wordBounds = currentWord.bounds;
        // Text content of the word
        var word = currentWord.text;
        // Glyphs that make up the word
        var glyphs = currentWord.glyphs;
        // Name of the font used for the word
        var wordFontName = currentWord.fontName;
        // Style of the font used for the word
        var wordFontStyle = currentWord.fontStyle;
        // Size of the font used for the word
        var wordFontSize = currentWord.fontSize;
    });
});
// Write the document out as 'output.pdf'
document.save('output.pdf');
// Destroy the document once it is no longer needed
document.destroy();

Working with characters

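You can retrieve individual characters (glyphs) and their properties, including bounds, text, font name, font size, font style, text color, and rotation state, by iterating the glyphs of each word in the lines returned by the extractTextLines method. Refer to the code sample below.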

TypeScript
JavaScript
import { PdfDocument } from '@syncfusion/ej2-pdf';
import { PdfDataExtractor, TextLine, TextWord, PdfFontStyle } from '@syncfusion/ej2-pdf-data-extract';

// Load an existing PDF document
const document: PdfDocument = new PdfDocument(data);
// Initialize a new instance of the `PdfDataExtractor` class
const extractor: PdfDataExtractor = new PdfDataExtractor(document);
// Extract the `TextLine` collection from every page (index 0 through `pageCount - 1`)
const textLines: Array<TextLine> = extractor.extractTextLines({ startPageIndex: 0, endPageIndex: document.pageCount - 1 });
// Walk down the hierarchy: line -> word -> glyph (single character)
textLines.forEach((textLine: TextLine) => {
    textLine.words.forEach((textWord: TextWord) => {
        // `TextGlyph`, `Rectangle` and `PdfColor` are not imported by this sample, so the
        // callback parameter is typed contextually from `textWord.glyphs` and the values of
        // those types below are left to type inference instead of being annotated.
        textWord.glyphs.forEach((textGlyph) => {
            // Gets the bounds of the text glyph (a `Rectangle`).
            const glyphBounds = textGlyph.bounds;
            // Gets the single character of extracted text from the PDF page.
            const character: string = textGlyph.text;
            // Gets the font size used for a particular character of the text.
            const fontSize: number = textGlyph.fontSize;
            // Gets the name of the font used for a particular character of the text.
            const fontName: string = textGlyph.fontName;
            // Gets the font style used for a particular character of the text.
            const fontStyle: PdfFontStyle = textGlyph.fontStyle;
            // Gets the text color of the text glyph (a `PdfColor`).
            const color = textGlyph.color;
            // Gets the value indicating whether the glyph is rotated or not.
            const isRotated: boolean = textGlyph.isRotated;
        });
    });
});
// Save the document
document.save('output.pdf');
// Destroy the document
document.destroy();
// Open the source PDF from `data`
var document = new ej.pdf.PdfDocument(data);
// Create the extractor that reads content from the loaded document
var dataExtractor = new ej.pdfdataextract.PdfDataExtractor(document);
// Index of the last page; the range below starts at index 0
var lastPageIndex = document.pageCount - 1;
// Collect the text lines of the pages between the start and end index
var textLines = dataExtractor.extractTextLines({ startPageIndex: 0, endPageIndex: lastPageIndex });
// Visit every line, every word inside it, and every glyph inside each word
textLines.forEach(function (currentLine) {
    currentLine.words.forEach(function (currentWord) {
        currentWord.glyphs.forEach(function (currentGlyph) {
            // Bounds of the glyph
            var glyphBounds = currentGlyph.bounds;
            // The single character this glyph represents
            var character = currentGlyph.text;
            // Size of the font used for the character
            var fontSize = currentGlyph.fontSize;
            // Name of the font used for the character
            var fontName = currentGlyph.fontName;
            // Style of the font used for the character
            var fontStyle = currentGlyph.fontStyle;
            // Text color of the glyph
            var color = currentGlyph.color;
            // Whether the glyph is rotated
            var isRotated = currentGlyph.isRotated;
        });
    });
});
// Write the document out as 'output.pdf'
document.save('output.pdf');
// Destroy the document once it is no longer needed
document.destroy();

Search docs

Ask HelpBot