Migration Notes Leave feedback

Why To Migrate?

Here are the key reasons to use the new updated API provided by GroupDocs.Parser for Java since version 19.11:

Parser class is introduced as a single entry point to extract data from the document.
Data extraction was unified for all data types.
Document-related classes were unified into a common set.
Product architecture was redesigned from scratch in order to simplify passing options and classes to manipulate data.
Document information and preview generation procedures were simplified.

How To Migrate?

Here is a brief comparison of how to extract data using the old and new API.

Text

Old coding style

// Build the factory that hands out extractors
ExtractorFactory factory = new ExtractorFactory();
// Open a text extractor for the file; try-with-resources closes it on exit
try (TextExtractor extractor = factory.createTextExtractor(filePath)) {
    // Print lines one by one; a null line ends the loop
    String line;
    while ((line = extractor.extractLine()) != null) {
        System.out.println(line);
    }
}

New coding style

// Open the document through the Parser entry point
try (Parser parser = new Parser(filePath)) {
    // Obtain a reader over the document text
    try (TextReader reader = parser.getText()) {
        // A null reader is reported as "text extraction isn't supported"
        if (reader == null) {
            System.out.println("Text extraction isn't supported.");
            return;
        }
        // Print each line; a null line ends the loop
        for (String line = reader.readLine(); line != null; line = reader.readLine()) {
            System.out.println(line);
        }
    }
}

Text Page

Old coding style

// Build the factory that hands out extractors
ExtractorFactory factory = new ExtractorFactory();
// Open a text extractor for the file; try-with-resources closes it on exit
try (TextExtractor extractor = factory.createTextExtractor(filePath)) {
    // Page access is only available when the extractor implements IPageTextExtractor
    if (extractor instanceof IPageTextExtractor) {
        IPageTextExtractor pageExtractor = (IPageTextExtractor) extractor;
        // Print the first page (index 0)
        System.out.println(pageExtractor.extractPage(0));
    }
}

New coding style

// Create an instance of Parser class
try (Parser parser = new Parser(filePath)) {
    // Extract the first page text to the reader
    try (TextReader reader = parser.getText(0)) {
        // Check if text extraction is supported
        if (reader != null) {
            // Extract a text from the reader
            System.out.println(reader.readToEnd());
        }
    }
}

Search

Old coding style

// Build the factory that hands out extractors
ExtractorFactory factory = new ExtractorFactory();
// Open a text extractor for the file; try-with-resources closes it on exit
try (TextExtractor extractor = factory.createTextExtractor(filePath)) {
    // Searching is only available when the extractor implements ISearchable
    if (extractor instanceof ISearchable) {
        ISearchable searchable = (ISearchable) extractor;
        // The handler accumulates the results of the search
        ListSearchHandler handler = new ListSearchHandler();
        // Look for "keyword" in the document
        java.util.List<String> keywords = java.util.Arrays.asList("keyword");
        searchable.search(new SearchOptions(null), handler, keywords);
        // Print one line per search result
        for (SearchResult hit : handler.getList()) {
            String line = String.format("at %d: %s", hit.getIndex(), hit.getFoundText());
            System.out.println(line);
        }
    }
}

New coding style

// Open the document through the Parser entry point
try (Parser parser = new Parser(filePath)) {
    // Look for "keyword" in the document
    Iterable<SearchResult> results = parser.search("keyword");
    // A null result is reported as "search isn't supported"
    if (results == null) {
        System.out.println("Search isn't supported.");
        return;
    }
    // Print one line per search result
    for (SearchResult hit : results) {
        String line = String.format("at %d: %s", hit.getPosition(), hit.getText());
        System.out.println(line);
    }
}

File Type Detection

Old coding style

// Detect the file type of filePath with the DEFAULT detector instance exposed by
// CompositeMediaTypeDetector, and print the result
System.out.println(CompositeMediaTypeDetector.DEFAULT.detect(filePath));

New coding style

// Create an instance of Parser class; try-with-resources closes it when the block exits
try (Parser parser = new Parser(filePath)) {
    // Read the file type from the document info and print it
    System.out.println(parser.getDocumentInfo().getFileType());
}

Metadata

Old coding style

// Ask a fresh extractor factory for a metadata extractor bound to the file
MetadataExtractor extractor = new ExtractorFactory().createMetadataExtractor(filePath);
// Pull the metadata collection out of the file
MetadataCollection metadata = extractor.extractMetadata(filePath);
// Print every entry as "key = value"
for (String key : metadata.getKeys()) {
    System.out.println(String.format("%s = %s", key, metadata.get_Item(key)));
}

New coding style

// Open the document through the Parser entry point
try (Parser parser = new Parser(filePath)) {
    // Request the metadata items of the document
    Iterable<MetadataItem> items = parser.getMetadata();
    // A null result is reported as "metadata extraction isn't supported"
    if (items == null) {
        System.out.println("Metadata extraction isn't supported.");
        return;
    }
    // Print every item as "name = value"
    for (MetadataItem entry : items) {
        String line = String.format("%s = %s", entry.getName(), entry.getValue());
        System.out.println(line);
    }
}

Structure

Old coding style

// Build the factory that hands out extractors
ExtractorFactory factory = new ExtractorFactory();
// Open a text extractor for the file; try-with-resources closes it on exit
try (TextExtractor extractor = factory.createTextExtractor(filePath)) {
    // Structure extraction is only available when the extractor implements IStructuredExtractor
    if (extractor instanceof IStructuredExtractor) {
        IStructuredExtractor structured = (IStructuredExtractor) extractor;
        // The handler records hyperlinks while the structure is extracted
        Handler handler = new Handler();
        structured.extractStructured(handler);
        // Print each collected hyperlink on its own line
        for (String url : handler.getLinks()) {
            System.out.println(url);
        }
    }
}

// Structured-extraction handler that collects the link of every hyperlink it is notified about
class Handler extends StructuredHandler {
    // Links gathered so far, in the order onStartHyperlink was called
    private final java.util.List<String> links = new java.util.ArrayList<String>();

    public Handler() {
    }

    // Returns the live list of collected links (not a copy)
    public java.util.List<String> getLinks() {
        return this.links;
    }

    // Called at the start of a hyperlink; stores its link
    @Override
    protected void onStartHyperlink(HyperlinkProperties properties) {
        this.links.add(properties.getLink());
    }
}

New coding style

// Open the document through the Parser entry point
try (Parser parser = new Parser(filePath)) {
    // Request the text structure as an XML document
    Document structure = parser.getStructure();
    // A null document is reported as "text structure extraction isn't supported"
    if (structure == null) {
        System.out.println("Text structure extraction isn't supported.");
        return;
    }
    // Walk the XML tree starting from its root element
    Node root = structure.getDocumentElement();
    readNode(root);
}

/**
 * Recursively walks the descendants of {@code node} and prints the value of the
 * "link" attribute of every node named "hyperlink" (name compared ignoring case).
 *
 * @param node the node whose children are visited; the node itself is not inspected
 */
void readNode(Node node) {
    NodeList nodes = node.getChildNodes();
    for (int i = 0; i < nodes.getLength(); i++) {
        Node n = nodes.item(i);
        // Compare string contents: == on strings only tests reference identity, and
        // equalsIgnoreCase avoids the default-locale dependence of toLowerCase()
        if ("hyperlink".equalsIgnoreCase(n.getNodeName())) {
            // getAttributes() is null for nodes without attributes, so check first
            Node a = n.hasAttributes() ? n.getAttributes().getNamedItem("link") : null;
            if (a != null) {
                System.out.println(a.getNodeValue());
            }
        }
        // Descend so that nested hyperlinks are reported as well
        if (n.hasChildNodes()) {
            readNode(n);
        }
    }
}

We value your opinion. Your feedback will help us improve our documentation.

Migration Notes Leave feedback

Why To Migrate?

How To Migrate?

Text

Text Page

Search

File Type Detection

Metadata

Structure

Was this page helpful?

Any additional feedback you'd like to share with us?

Please tell us how we can improve this page.

Thank you for your feedback!