Last active
April 15, 2025 08:11
-
-
Save marcominerva/e14d4a9a3b702c990dc34d4a6013b64c to your computer and use it in GitHub Desktop.
Mistral OCR - Get markdown from PDF
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System.Net.Http.Json; | |
using System.Text.Json; | |
using System.Text.Json.Serialization; | |
// Mistral OCR sample: uploads a PDF to the OCR endpoint as a base64 data URL,
// prints the Markdown returned for each page and stores it in a file.

// Configuration: fill these in before running.
var endpoint = "https://<endpoint>.<region>.models.ai.azure.com/";
var apiKey = "";
var inputFilePath = @"";    // The path of the source PDF
var outputFilePath = @"";   // The path of the destination Markdown file

// Fail fast with a readable message instead of an ArgumentException /
// FileNotFoundException raised from deep inside the file APIs.
if (string.IsNullOrWhiteSpace(apiKey)
    || string.IsNullOrWhiteSpace(inputFilePath)
    || string.IsNullOrWhiteSpace(outputFilePath))
{
    Console.WriteLine("Error: apiKey, inputFilePath and outputFilePath must be set before running.");
    Environment.ExitCode = 1;
    return;
}

if (!File.Exists(inputFilePath))
{
    Console.WriteLine($"Error: the input file {inputFilePath} does not exist.");
    Environment.ExitCode = 1;
    return;
}

// The whole PDF travels inline in the request body, encoded as base64.
var base64Content = Convert.ToBase64String(await File.ReadAllBytesAsync(inputFilePath));

Console.WriteLine($"Getting the markdown for {inputFilePath}...");

// Disposed at the end of the script so the underlying connection is released.
using var httpClient = new HttpClient
{
    BaseAddress = new Uri(endpoint),
    DefaultRequestHeaders =
    {
        { "Authorization", $"Bearer {apiKey}" }
    }
};

// Request payload; the member names are the JSON keys sent to the service.
var content = new
{
    model = "mistral-ocr-2503",
    document = new
    {
        type = "document_url",
        document_url = $"data:application/pdf;base64,{base64Content}"
    }
};

using var jsonContent = JsonContent.Create(content, options: JsonSerializerOptions.Web);

// Serialize the body into memory before sending.
// NOTE(review): presumably needed so the request carries a Content-Length
// instead of being streamed; confirm before removing.
await jsonContent.LoadIntoBufferAsync();

using var response = await httpClient.PostAsync("v1/ocr", jsonContent);
Console.WriteLine($"Response status code: {response.StatusCode}");

if (!response.IsSuccessStatusCode)
{
    Console.WriteLine($"Error: {await response.Content.ReadAsStringAsync()}");
    return;
}

var ocrResponse = await response.Content.ReadFromJsonAsync<OcrResponse>();

// A literal "null" body, or one without pages, would otherwise end in a NullReferenceException.
if (ocrResponse?.Pages is null)
{
    Console.WriteLine("Error: the service returned an empty or unexpected response body.");
    Environment.ExitCode = 1;
    return;
}

// Materialize once: the same Markdown is both echoed and written to disk.
var markdownPages = ocrResponse.Pages.Select(p => p.Markdown).ToList();

Console.WriteLine();

foreach (var markdown in markdownPages)
{
    Console.WriteLine(markdown);
}

Console.WriteLine();

// Usage figures are informational only; skip them if the service omitted the section.
if (ocrResponse.UsageInfo is { } usage)
{
    Console.WriteLine($"Pages processed: {usage.PagesProcessed}");
    Console.WriteLine($"Document size: {usage.DocumentSize} bytes");
    Console.WriteLine();
}

// Overwrite rather than append, so re-running the script does not duplicate
// the document inside an existing output file.
await File.WriteAllLinesAsync(outputFilePath, markdownPages);
Console.WriteLine($"The file {outputFilePath} has been saved.");
/// <summary>
/// Shape the script deserializes the <c>v1/ocr</c> response body into.
/// </summary>
/// <param name="Pages">
/// One entry per page returned by the service. No explicit JSON name is declared,
/// so binding relies on the serializer's property-name matching.
/// </param>
/// <param name="UsageInfo">Usage figures, read from the <c>usage_info</c> JSON property.</param>
public record class OcrResponse(
    IEnumerable<OcrPage> Pages,
    [property: JsonPropertyName("usage_info")] OcrUsageInfo UsageInfo);
/// <summary>
/// A single page of an OCR result.
/// </summary>
/// <param name="Markdown">The Markdown text of the page, as printed and saved by the script.</param>
public record class OcrPage(string Markdown);
/// <summary>
/// Usage figures attached to an OCR response.
/// </summary>
/// <param name="PagesProcessed">Read from the <c>pages_processed</c> JSON property.</param>
/// <param name="DocumentSize">
/// Read from the <c>doc_size_bytes</c> JSON property; the script reports it in bytes.
/// </param>
public record class OcrUsageInfo(
    [property: JsonPropertyName("pages_processed")] int PagesProcessed,
    [property: JsonPropertyName("doc_size_bytes")] int DocumentSize);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment