Skip to content

Instantly share code, notes, and snippets.

@marcominerva
Last active April 15, 2025 08:11
Show Gist options
  • Save marcominerva/e14d4a9a3b702c990dc34d4a6013b64c to your computer and use it in GitHub Desktop.
Save marcominerva/e14d4a9a3b702c990dc34d4a6013b64c to your computer and use it in GitHub Desktop.
Mistral OCR - Get markdown from PDF
using System.Net.Http.Json;
using System.Text.Json;
using System.Text.Json.Serialization;
// Sends a PDF to the "v1/ocr" endpoint of a Mistral OCR deployment, prints the
// Markdown of every returned page and saves it to a file.

var endpoint = "https://<endpoint>.<region>.models.ai.azure.com/";
var apiKey = "";
var inputFilePath = @""; // The path of the source PDF
var outputFilePath = @""; // The path of the destination Markdown file

// The whole PDF travels inside the request body as a base64 data URL.
var base64Content = Convert.ToBase64String(await File.ReadAllBytesAsync(inputFilePath));

Console.WriteLine($"Getting the markdown for {inputFilePath}...");

// Disposed at the end of the program so the underlying handler is released.
using var httpClient = new HttpClient
{
    BaseAddress = new Uri(endpoint),
    DefaultRequestHeaders =
    {
        { "Authorization", $"Bearer {apiKey}" }
    }
};

var content = new
{
    model = "mistral-ocr-2503",
    document = new
    {
        type = "document_url",
        document_url = $"data:application/pdf;base64,{base64Content}"
    }
};

using var jsonContent = JsonContent.Create(content, options: JsonSerializerOptions.Web);

// Serializes the payload into memory before sending.
// NOTE(review): presumably needed so the request carries a Content-Length header
// instead of being sent chunked - confirm against the target endpoint.
await jsonContent.LoadIntoBufferAsync();

using var response = await httpClient.PostAsync("v1/ocr", jsonContent);
Console.WriteLine($"Response status code: {response.StatusCode}");

if (!response.IsSuccessStatusCode)
{
    Console.WriteLine($"Error: {await response.Content.ReadAsStringAsync()}");
    return;
}

var ocrResponse = await response.Content.ReadFromJsonAsync<OcrResponse>();
if (ocrResponse is null)
{
    // A literal JSON "null" body deserializes to null; report it instead of crashing.
    Console.WriteLine("Error: the service returned an empty response.");
    return;
}

// Materialize the Markdown once: it is both printed and written to disk.
var markdownPages = (ocrResponse.Pages ?? Enumerable.Empty<OcrPage>())
    .Select(p => p.Markdown)
    .ToList();

Console.WriteLine();

foreach (var markdown in markdownPages)
{
    Console.WriteLine(markdown);
}

Console.WriteLine();

if (ocrResponse.UsageInfo is { } usageInfo)
{
    Console.WriteLine($"Pages processed: {usageInfo.PagesProcessed}");
    Console.WriteLine($"Document size: {usageInfo.DocumentSize} bytes");
    Console.WriteLine();
}

// Overwrite rather than append, so running the program again does not
// duplicate the pages in an already existing output file.
await File.WriteAllLinesAsync(outputFilePath, markdownPages);
Console.WriteLine($"The file {outputFilePath} has been saved.");
/// <summary>
/// Root object of an OCR response: the recognized pages plus usage figures.
/// </summary>
/// <param name="Pages">The pages contained in the response.</param>
/// <param name="UsageInfo">Usage figures, mapped to the <c>usage_info</c> JSON property.</param>
public record OcrResponse(
    IEnumerable<OcrPage> Pages,
    [property: JsonPropertyName("usage_info")] OcrUsageInfo UsageInfo);
/// <summary>
/// A single page of an OCR response.
/// </summary>
/// <param name="Markdown">The Markdown text of the page.</param>
public record OcrPage(
    string Markdown);
/// <summary>
/// Usage figures attached to an OCR response.
/// </summary>
/// <param name="PagesProcessed">Mapped to the <c>pages_processed</c> JSON property.</param>
/// <param name="DocumentSize">
/// Mapped to the <c>doc_size_bytes</c> JSON property.
/// NOTE(review): the service may be allowed to return <c>null</c> here - confirm
/// against the API reference whether this should be nullable.
/// </param>
public record OcrUsageInfo(
    [property: JsonPropertyName("pages_processed")] int PagesProcessed,
    [property: JsonPropertyName("doc_size_bytes")] int DocumentSize);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment