Extract images from document

GroupDocs.Parser provides functionality to extract images from various document formats including PDF, Word, Excel, PowerPoint, and more.

Prerequisites

  • GroupDocs.Parser for Python via .NET installed
  • Sample documents containing images
  • Write access to save extracted images (optional)

Extract images from document

To extract all images from a document:

from groupdocs.parser import Parser

# Open the document through the Parser context manager
with Parser("./sample.pdf") as parser:
    # Ask the parser for every image in the document
    images = parser.get_images()

    if images is not None:
        # Report each image with a 1-based number
        for number, image in enumerate(images, start=1):
            area = image.rectangle
            print(f"Image {number}:")
            print(f"  Page: {image.page.index + 1}")
            print(f"  Type: {image.file_type}")
            print(f"  Size: {area.width}x{area.height}")
            print(f"  Position: ({area.left}, {area.top})")
    else:
        # None signals that this document format has no image extraction
        print("Image extraction isn't supported")

The following sample file is used in this example: sample.pdf

Expected behavior: Returns a collection of PageImageArea objects representing all images found in the document, or None if image extraction is not supported.

Save extracted images to files

To save extracted images to disk:

from groupdocs.parser import Parser
import os

# Make sure the destination folder exists before any image is written
output_dir = "extracted_images"
os.makedirs(output_dir, exist_ok=True)

# Open the document through the Parser context manager
with Parser("./sample.docx") as parser:
    images = parser.get_images()

    if images is not None:
        for number, image in enumerate(images, start=1):
            # Name the file by its 1-based position plus the extension
            # reported for the image's file type
            target = os.path.join(
                output_dir, f"image_{number}{image.file_type.extension}"
            )

            # Write the image to disk and report where it went
            image.save(target)
            print(f"Saved: {target}")
    else:
        # None signals that this document format has no image extraction
        print("Image extraction isn't supported")

The following sample file is used in this example: sample.docx

Expected behavior: Saves each extracted image to a separate file with the appropriate file extension (.png, .jpg, .gif, etc.).

Extract images with metadata

To extract images along with detailed metadata:

from groupdocs.parser import Parser

# Create an instance of Parser class
with Parser("./sample.pptx") as parser:
    # Check if image extraction is supported.
    # (An if/else is used here because `return` is only valid inside a function.)
    if not parser.features.images:
        print("Document doesn't support image extraction")
    else:
        # Materialize the result once so the images can be both counted and
        # iterated without extracting them a second time. get_images() may
        # return None, which is treated as "no images".
        images = list(parser.get_images() or [])

        if images:
            print(f"Found {len(images)} images\n")

            for number, image in enumerate(images, start=1):
                print(f"Image {number}:")
                print(f"  Page: {image.page.index + 1}")
                print(f"  Format: {image.file_type}")
                print(f"  Rotation: {image.rotation}°")
                print(f"  Rectangle: {image.rectangle}")
                print(f"  Width: {image.rectangle.width}")
                print(f"  Height: {image.rectangle.height}")
                print()

The following sample file is used in this example: sample.pptx

Expected behavior: Displays comprehensive information about each image including position, size, format, and rotation angle.

Get image stream

To work with image data as a stream:

from groupdocs.parser import Parser
from groupdocs.parser.options import ImageOptions, ImageFormat

# Create an instance of Parser class
with Parser("./sample.pdf") as parser:
    # Extract images
    images = parser.get_images()

    if images:
        # Build the PNG conversion options once: they do not depend on the
        # individual image, so there is no need to recreate them per iteration
        png_options = ImageOptions(ImageFormat.PNG)

        for number, image in enumerate(images, start=1):
            # Get the image stream in the image's original format
            image_stream = image.get_image_stream()

            # Read image data
            image_data = image_stream.read()
            print(f"Image {number}: {len(image_data)} bytes, format: {image.file_type}")

            # Optionally convert to PNG by passing the options to the same call
            png_stream = image.get_image_stream(png_options)
            png_data = png_stream.read()
            print(f"  Converted to PNG: {len(png_data)} bytes")

The following sample file is used in this example: sample.pdf

The following image format is used for conversion in this example: ImageFormat.PNG

Expected behavior: Provides access to raw image data as a stream, with optional format conversion.

Convert images during extraction

To convert images to a specific format during extraction:

from groupdocs.parser import Parser
from groupdocs.parser.options import ImageOptions, ImageFormat
import os

# Make sure the destination folder exists before any image is written
output_dir = "converted_images"
os.makedirs(output_dir, exist_ok=True)

# Open the document through the Parser context manager
with Parser("./sample.pdf") as parser:
    images = parser.get_images()

    if images:
        # One set of PNG options is shared by every save call below
        png_options = ImageOptions(ImageFormat.PNG)

        for number, image in enumerate(images, start=1):
            # Every output file gets a .png name, whatever the source format was
            target = os.path.join(output_dir, f"image_{number}.png")

            # Passing the options makes save() convert the image
            image.save(target, png_options)
            print(f"Saved as PNG: {target}")

The following sample file is used in this example: sample.pdf

The following image format is used for conversion in this example: ImageFormat.PNG

Expected behavior: All extracted images are converted to PNG format before saving, regardless of their original format.

Batch image extraction

Extract images from multiple documents:

from groupdocs.parser import Parser
import os
from pathlib import Path

def extract_images_from_directory(input_dir, output_dir):
    """
    Extract images from all documents in a directory.

    Walks ``input_dir`` recursively and, for every file whose extension is one
    of the supported document types, saves the document's images into
    ``<output_dir>/<document file name without extension>/``.

    Args:
        input_dir: Directory that is searched recursively for documents.
        output_dir: Directory that receives one subdirectory per document.
            It is created if it does not exist.

    A failure while processing one document is printed and does not stop the
    processing of the remaining documents.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Supported document extensions
    extensions = {'.pdf', '.docx', '.doc', '.xlsx', '.pptx', '.ppt'}

    for file_path in Path(input_dir).rglob('*'):
        # rglob('*') also yields directories; skip those and unsupported files
        if not file_path.is_file() or file_path.suffix.lower() not in extensions:
            continue

        print(f"\nProcessing: {file_path.name}")

        try:
            with Parser(str(file_path)) as parser:
                images = parser.get_images()

                if images is None:
                    print("  Image extraction not supported")
                    continue

                # Create subdirectory for this document
                doc_output_dir = os.path.join(output_dir, file_path.stem)
                os.makedirs(doc_output_dir, exist_ok=True)

                # Save images; image_count ends up as the number of the last
                # image saved and stays 0 when the document has none
                image_count = 0
                for image_count, image in enumerate(images, start=1):
                    filename = f"image_{image_count}{image.file_type.extension}"
                    image.save(os.path.join(doc_output_dir, filename))

                print(f"  Extracted {image_count} images")

        except Exception as e:
            # Best-effort batch run: report the problem and move on
            print(f"  Error: {e}")

# Usage: process every supported document found under "input_documents"
# (searched recursively) and write each document's images to its own
# subdirectory of "extracted_images"
extract_images_from_directory("input_documents", "extracted_images")

Notes

  • The get_images() method returns None if image extraction is not supported for the document format
  • Always check parser.features.images before attempting to extract images
  • Supported output formats: BMP, GIF, JPEG, PNG, WebP
  • Images are extracted with their original quality and resolution
  • The rotation property indicates the image rotation angle (0, 90, 180, or 270 degrees)
  • Use get_image_stream() for memory-efficient processing of large images