Convert documents for RAG pipelines

Convert documents for RAG pipelines

Retrieval-augmented generation (RAG) systems need documents in a clean, structured text format for chunking and embedding. Markdown is ideal — it preserves document structure (headings, lists, tables) while being easy to parse.

flowchart LR
    A["PDF / DOCX / XLSX"]
    B["GroupDocs.Markdown"]
    C["Markdown"]
    D["Text Chunking"]
    E["Vector Embeddings"]
    F["LLM Query"]
    A --> B --> C --> D --> E --> F

Basic conversion for RAG

using GroupDocs.Markdown;

// Convert a document to Markdown for a text-only RAG pipeline.
// Images are skipped because only the text is chunked and embedded.
var options = new ConvertOptions
{
    ImageExportStrategy = new SkipImagesStrategy(),
    Flavor = MarkdownFlavor.CommonMark
};

string markdown = MarkdownConverter.ToMarkdown("knowledge-base.pdf", options);

// Split in front of every level-1 or level-2 heading. The lookahead is zero-width,
// so the "# " / "## " marker stays at the start of its chunk; splitting on the
// literal "\n# " would strip the marker and lose the heading level.
// NOTE: a line starting with "# " inside a fenced code block is also treated as a
// heading — use a Markdown parser if your documents contain such lines.
string[] chunks = System.Text.RegularExpressions.Regex.Split(
    markdown,
    @"^(?=#{1,2} )",
    System.Text.RegularExpressions.RegexOptions.Multiline);

foreach (string rawChunk in chunks)
{
    // A document that begins with a heading yields an empty leading chunk; skip it.
    if (string.IsNullOrWhiteSpace(rawChunk))
    {
        continue;
    }

    // Drop the line break left at the end of each section.
    string chunk = rawChunk.Trim();

    // Send each chunk to your embedding model
    Console.WriteLine($"Chunk ({chunk.Length} chars): {chunk.Substring(0, Math.Min(80, chunk.Length))}...");
}

Batch processing a document library

using GroupDocs.Markdown;

// Shared conversion settings for the whole batch: images are skipped.
var options = new ConvertOptions { ImageExportStrategy = new SkipImagesStrategy() };

// Convert each "*.pdf" found in the "documents" directory and write the result
// to a ".md" file with the same path and base name.
foreach (string file in Directory.GetFiles("documents", "*.pdf"))
{
    try
    {
        string converted = MarkdownConverter.ToMarkdown(file, options);
        string target = Path.ChangeExtension(file, ".md");

        File.WriteAllText(target, converted);
        Console.WriteLine($"Converted: {file}");
    }
    catch (GroupDocsMarkdownException ex)
    {
        // A conversion failure is reported and the loop moves on to the next file.
        Console.WriteLine($"Skipped {file}: {ex.Message}");
    }
}