Skip to content

Instantly share code, notes, and snippets.

@jdmichaud
Last active June 21, 2025 06:06
Show Gist options
  • Save jdmichaud/794fb2fdfcad4a2306da4cdee51d7b8c to your computer and use it in GitHub Desktop.
Save jdmichaud/794fb2fdfcad4a2306da4cdee51d7b8c to your computer and use it in GitHub Desktop.
Markdown parser
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <ctype.h>
#define MAX_LINE_LENGTH 4096 // Max line length for reading input (can be adjusted)
// Capacity given to a node's children array the first time it grows; md_node_add_child doubles it afterwards.
#define INITIAL_CHILDREN_CAPACITY 4
// Distance between tab stops, in columns, used by expand_tabs when replacing '\t' with spaces.
#define TAB_STOP_WIDTH 4
// --- Enums and Structs ---
/* Kind tag carried by every AST node. Block-level kinds come first,
 * inline-level kinds follow; the numeric values are the implicit 0..21. */
typedef enum {
    /* ---- block-level nodes ---- */
    MD_NODE_DOCUMENT,
    MD_NODE_PARAGRAPH,
    MD_NODE_HEADING,
    MD_NODE_BLOCKQUOTE,
    MD_NODE_UNORDERED_LIST,
    MD_NODE_ORDERED_LIST,
    MD_NODE_LIST_ITEM,
    MD_NODE_CODE_BLOCK,          /* indented code block */
    MD_NODE_FENCED_CODE_BLOCK,
    MD_NODE_HORIZONTAL_RULE,
    MD_NODE_HTML_BLOCK,          /* basic support only */

    /* ---- inline-level nodes ---- */
    MD_NODE_TEXT,
    MD_NODE_EMPHASIS,
    MD_NODE_STRONG,
    MD_NODE_STRIKETHROUGH,
    MD_NODE_INLINE_CODE,
    MD_NODE_LINK,
    MD_NODE_IMAGE,
    MD_NODE_SOFT_BREAK,
    MD_NODE_HARD_BREAK,
    MD_NODE_HTML_INLINE,         /* basic support only */
    MD_NODE_ESCAPED_CHAR
} MDNodeType;
/* One node of the Markdown AST. Which payload fields are meaningful depends
 * on `type`; md_node_new zero-fills the node, so unused fields are NULL / 0.
 * All char* payloads and the children array are released by md_node_free. */
typedef struct MDNode {
    MDNodeType type;

    /* TEXT, INLINE_CODE, HTML_INLINE / HTML_BLOCK, ESCAPED_CHAR */
    char* text_content;
    /* FENCED_CODE_BLOCK: language taken from the info string */
    char* code_language;
    /* CODE_BLOCK, FENCED_CODE_BLOCK: the literal code text */
    char* code_content;
    /* HEADING: level 1-6 */
    int heading_level;
    /* LINK, IMAGE: destination */
    char* url;
    /* LINK, IMAGE: optional title */
    char* title;
    /* IMAGE: alternative text */
    char* alt_text;
    /* LIST_ITEM: marker character ('*', '-', '+', '.', ')') */
    char list_item_marker;
    /* ORDERED_LIST: number of the first item (if applicable for the list itself) */
    int list_start_number;
    /* Lists: true if no blank lines between items */
    bool tight_list;

    /* Tree links: optional parent pointer plus a growable child array. */
    struct MDNode* parent;
    struct MDNode** children;
    size_t children_count;
    size_t children_capacity;
} MDNode;
// --- Utility Functions ---
// Returns a freshly malloc'ed copy of the NUL-terminated string s.
// Yields NULL when s is NULL or when the allocation fails; the caller frees the result.
char* md_strdup(const char* s) {
    char* copy = NULL;
    if (s != NULL) {
        const size_t bytes = strlen(s) + 1; // include the terminator
        copy = (char*)malloc(bytes);
        if (copy != NULL) {
            memcpy(copy, s, bytes);
        }
    }
    return copy;
}
// Returns a freshly malloc'ed, NUL-terminated copy of at most n characters of s.
// Copying stops at the first '\0' in s, so a source shorter than n is never read
// past its terminator. Yields NULL when s is NULL or when the allocation fails;
// the caller frees the result.
char* md_strndup(const char* s, size_t n) {
    if (!s) return NULL;
    size_t len = 0;
    while (len < n && s[len] != '\0') {
        len++;
    }
    char* d = (char*)malloc(len + 1);
    if (!d) return NULL;
    memcpy(d, s, len);
    d[len] = '\0';
    return d;
}
// Returns a pointer to the first character of str that isspace() does not accept
// (possibly the terminating '\0'). The input is not modified and must not be NULL.
const char* trim_leading_whitespace(const char* str) {
    const char* cursor;
    for (cursor = str; *cursor != '\0'; ++cursor) {
        if (!isspace((unsigned char)*cursor)) {
            break;
        }
    }
    return cursor;
}
// Removes trailing isspace() characters from str in place and returns str.
// NULL and empty strings are returned untouched (nothing is written to them).
// The scan is done on a length index, so no pointer is ever formed before the
// start of the buffer, even when the string is entirely whitespace.
char* trim_trailing_whitespace(char* str) {
    if (!str || !*str) return str;
    size_t len = strlen(str);
    while (len > 0 && isspace((unsigned char)str[len - 1])) {
        len--;
    }
    str[len] = '\0';
    return str;
}
// Number of ' ' characters at the very start of line. Tabs and every other
// character end the run, so a line starting with '\t' counts as 0.
int count_leading_spaces(const char* line) {
    const char* cursor = line;
    while (*cursor == ' ') {
        ++cursor;
    }
    return (int)(cursor - line);
}
// True when line is empty or consists solely of isspace() characters.
bool is_blank_line(const char* line) {
    const char* cursor = line;
    // Walk over whitespace; the terminator itself is not whitespace and stops the loop.
    while (*cursor != '\0' && isspace((unsigned char)*cursor)) {
        cursor++;
    }
    return *cursor == '\0';
}
// Returns a malloc'ed copy of line in which every '\t' is replaced by the number
// of spaces needed to reach the next multiple of TAB_STOP_WIDTH columns.
// Returns NULL when line is NULL, when the worst-case size would overflow
// size_t, or when allocation fails. The caller frees the result.
char* expand_tabs(const char* line) {
    if (!line) return NULL;
    const size_t len = strlen(line);
    // Worst case: every input character is a tab that expands to TAB_STOP_WIDTH spaces.
    if (len > ((size_t)-1 - 1) / TAB_STOP_WIDTH) return NULL;
    char* expanded = (char*)malloc(len * TAB_STOP_WIDTH + 1);
    if (!expanded) return NULL;
    size_t out = 0; // write index; also the current output column
    for (size_t i = 0; i < len; ++i) {
        if (line[i] == '\t') {
            size_t pad = TAB_STOP_WIDTH - (out % TAB_STOP_WIDTH);
            memset(expanded + out, ' ', pad);
            out += pad;
        } else {
            expanded[out++] = line[i];
        }
    }
    expanded[out] = '\0';
    // Shrink to the size actually used; keep the larger buffer if realloc fails.
    char* shrunk = (char*)realloc(expanded, out + 1);
    return shrunk ? shrunk : expanded;
}
// --- AST Node Management ---
// Allocates a zero-filled node of the given type.
// Never returns NULL: on allocation failure it reports via perror and terminates
// the process (chosen for simplicity in this single-file program).
MDNode* md_node_new(MDNodeType type) {
    MDNode* fresh = (MDNode*)calloc(1, sizeof(MDNode));
    if (fresh != NULL) {
        fresh->type = type;
        return fresh;
    }
    perror("Failed to allocate MDNode");
    exit(EXIT_FAILURE);
}
// Appends child to parent's children array and sets child->parent.
// The array starts at INITIAL_CHILDREN_CAPACITY slots and doubles when full.
// Does nothing when either argument is NULL. If growing the array fails, the
// error is reported with perror and the child is NOT attached (it stays owned
// by the caller and its parent pointer is left unchanged).
void md_node_add_child(MDNode* parent, MDNode* child) {
    if (parent == NULL || child == NULL) {
        return;
    }
    const bool array_full = parent->children_count >= parent->children_capacity;
    if (array_full) {
        size_t grown_capacity = INITIAL_CHILDREN_CAPACITY;
        if (parent->children_capacity != 0) {
            grown_capacity = parent->children_capacity * 2;
        }
        MDNode** grown = (MDNode**)realloc(parent->children, grown_capacity * sizeof(MDNode*));
        if (grown == NULL) {
            perror("Failed to reallocate children array");
            return;
        }
        parent->children_capacity = grown_capacity;
        parent->children = grown;
    }
    child->parent = parent;
    parent->children[parent->children_count] = child;
    parent->children_count += 1;
}
// Recursively releases node, all of its descendants, and every string payload
// they own. Passing NULL is a no-op.
void md_node_free(MDNode* node) {
    if (node == NULL) {
        return;
    }
    // Descendants first, then the array that held them.
    MDNode** child = node->children;
    for (size_t left = node->children_count; left > 0; --left) {
        md_node_free(*child++);
    }
    free(node->children);
    // String payloads; unused ones are NULL and free(NULL) is harmless.
    free(node->alt_text);
    free(node->title);
    free(node->url);
    free(node->code_content);
    free(node->code_language);
    free(node->text_content);
    free(node);
}
// --- Line Buffer for Block Parsing ---
/* Growable array of heap-allocated line copies used while collecting the
 * lines that belong to one block. */
typedef struct {
    char** lines;   /* owned copies of the lines; NULL while empty */
    int count;      /* number of entries currently stored */
    int capacity;   /* number of slots allocated in `lines` */
} LineBuffer;
// Puts buf into the empty state: no storage, zero entries, zero capacity.
// Does not free anything that buf may have held before.
void init_line_buffer(LineBuffer* buf) {
    *buf = (LineBuffer){ .lines = NULL, .count = 0, .capacity = 0 };
}
// Appends a private copy of line to buf, growing the array (16 slots first,
// then doubling) when it is full.
// On any allocation failure the error is reported with perror and the line is
// dropped; buf is left exactly as it was, so count never exceeds the storage
// that was really allocated and no NULL entry is ever stored.
// A NULL buf or NULL line is ignored.
void add_line_to_buffer(LineBuffer* buf, const char* line) {
    if (!buf || !line) return;
    if (buf->count >= buf->capacity) {
        // Commit the new capacity only after realloc has succeeded.
        int new_capacity = buf->capacity == 0 ? 16 : buf->capacity * 2;
        char** new_lines_arr = (char**)realloc(buf->lines, (size_t)new_capacity * sizeof(char*));
        if (!new_lines_arr) {
            perror("Failed to realloc line buffer");
            return; // Data loss, but trying to continue
        }
        buf->lines = new_lines_arr;
        buf->capacity = new_capacity;
    }
    char* copy = md_strdup(line);
    if (!copy) {
        perror("Failed to copy line into line buffer");
        return; // Data loss, but trying to continue
    }
    buf->lines[buf->count++] = copy;
}
// Frees every stored line and the array itself, then leaves buf in the same
// empty state that init_line_buffer produces, so it can be reused.
void free_line_buffer(LineBuffer* buf) {
    int remaining = buf->count;
    while (remaining > 0) {
        remaining--;
        free(buf->lines[remaining]);
    }
    free(buf->lines);
    buf->lines = NULL;
    buf->count = 0;
    buf->capacity = 0;
}
// --- Forward Declarations for Parsers ---
// Parses the inline markup in the half-open range [text_start, text_end) and
// appends the resulting nodes as children of parent_node.
void parse_inlines_recursive(MDNode* parent_node, const char* text_start, const char* text_end);
// Convenience wrapper: parses a whole NUL-terminated string; NULL or empty text is ignored.
void parse_inlines(MDNode* parent_node, const char* text);
// Block-level parser (defined later in the file). The container parsers call it
// with an array of lines, the line count, a cursor index, and the node to fill.
MDNode* parse_blocks(const char** lines, int num_lines, int* current_line_index, MDNode* parent_container);
// Returns a new HORIZONTAL_RULE node when line is a thematic break, otherwise NULL.
MDNode* parse_horizontal_rule(const char* line);
// --- Inline Parsing ---
// Scans [s, text_end) for the first occurrence of c that is not written as the
// two-character sequence backslash + c. Returns a pointer to it, or NULL when
// there is none in the range.
const char* find_next_unescaped(const char* s, const char* text_end, char c) {
    const char* cursor = s;
    while (cursor < text_end) {
        // "\c" is an escaped target: step over both characters.
        int escaped_target = (*cursor == '\\') && (cursor + 1 < text_end) && (cursor[1] == c);
        if (escaped_target) {
            cursor += 2;
            continue;
        }
        if (*cursor == c) {
            return cursor;
        }
        ++cursor;
    }
    return NULL;
}
// start points at an opening delimiter of marker_len characters. Searches the
// text after it, up to text_end, for the next occurrence of marker that is not
// immediately preceded by a backslash. Returns a pointer to that closing
// delimiter, or NULL if none fits entirely inside the range.
const char* find_matching_delimiter(const char* start, const char* text_end, const char* marker, int marker_len) {
    const char* last_candidate = text_end - marker_len; // last position where marker still fits
    const char* scan = start + marker_len;              // skip the opening delimiter
    while (scan <= last_candidate) {
        if (strncmp(scan, marker, marker_len) != 0) {
            scan++;
            continue;
        }
        if (scan > start && scan[-1] == '\\') {
            // Escaped occurrence: jump over it and keep looking.
            scan += marker_len;
            continue;
        }
        return scan;
    }
    return NULL;
}
void parse_inlines_recursive(MDNode* parent_node, const char* text_start, const char* text_end) {
if (text_start >= text_end) return;
const char* p = text_start;
const char* current_segment_start = text_start;
while (p < text_end) {
MDNode* inline_node = NULL;
const char* next_p = p; // Store where p should jump to
// 1. Escaped characters
if (*p == '\\' && (p + 1) < text_end && strchr("*_`~[]()#+-.<>!", *(p + 1))) {
if (p > current_segment_start) {
MDNode* text_node = md_node_new(MD_NODE_TEXT);
text_node->text_content = md_strndup(current_segment_start, p - current_segment_start);
md_node_add_child(parent_node, text_node);
}
inline_node = md_node_new(MD_NODE_ESCAPED_CHAR);
inline_node->text_content = md_strndup(p + 1, 1);
md_node_add_child(parent_node, inline_node);
next_p = p + 2;
}
// 2. Images: ![alt](url "title")
else if (*p == '!' && (p + 1) < text_end && *(p + 1) == '[') {
const char* alt_text_start = p + 2;
const char* alt_text_end = find_next_unescaped(alt_text_start, text_end, ']');
if (alt_text_end && (alt_text_end + 1) < text_end && *(alt_text_end + 1) == '(') {
const char* url_start = alt_text_end + 2;
const char* url_end_search = url_start;
int paren_balance = 1;
while(url_end_search < text_end && paren_balance > 0) {
if (*url_end_search == '\\' && (url_end_search+1) < text_end) { url_end_search += 2; continue; }
if (*url_end_search == '(') paren_balance++;
else if (*url_end_search == ')') paren_balance--;
if (paren_balance == 0) break;
url_end_search++;
}
if (url_end_search < text_end && *url_end_search == ')') {
if (p > current_segment_start) {
MDNode* text_node = md_node_new(MD_NODE_TEXT);
text_node->text_content = md_strndup(current_segment_start, p - current_segment_start);
md_node_add_child(parent_node, text_node);
}
inline_node = md_node_new(MD_NODE_IMAGE);
inline_node->alt_text = md_strndup(alt_text_start, alt_text_end - alt_text_start);
const char* link_content_ptr = url_start;
const char* actual_url_end = url_end_search;
const char* title_start_ptr = NULL;
// Check for title within (url "title")
// Title must be at the end, enclosed in " or ' or ()
// Simplified: look for space then " or ' for title.
const char* temp_title_search = url_start;
while(temp_title_search < url_end_search) {
if (isspace((unsigned char)*temp_title_search) && (temp_title_search + 1) < url_end_search) {
char quote_char = *(temp_title_search + 1);
if (quote_char == '"' || quote_char == '\'') {
const char* t_start = temp_title_search + 2;
const char* t_end = t_start;
while(t_end < url_end_search && *t_end != quote_char) {
if (*t_end == '\\' && (t_end+1) < url_end_search) t_end++;
t_end++;
}
if (t_end < url_end_search && *t_end == quote_char) { // Found title
inline_node->title = md_strndup(t_start, t_end - t_start);
actual_url_end = temp_title_search; // URL ends before space leading to title
break;
}
}
}
temp_title_search++;
}
char* temp_url = md_strndup(link_content_ptr, actual_url_end - link_content_ptr);
inline_node->url = md_strdup(trim_trailing_whitespace(temp_url)); // Also trim leading just in case
free(temp_url);
md_node_add_child(parent_node, inline_node);
next_p = url_end_search + 1;
}
}
}
// 3. Links: [text](url "title")
else if (*p == '[') {
const char* text_s = p + 1;
const char* text_e = find_next_unescaped(text_s, text_end, ']');
if (text_e && (text_e + 1) < text_end && *(text_e + 1) == '(') {
const char* url_s = text_e + 2;
const char* url_e_search = url_s;
int bal = 1;
while(url_e_search < text_end && bal > 0) {
if (*url_e_search == '\\' && (url_e_search+1) < text_end) { url_e_search += 2; continue; }
if (*url_e_search == '(') bal++;
else if (*url_e_search == ')') bal--;
if (bal == 0) break;
url_e_search++;
}
if (url_e_search < text_end && *url_e_search == ')') {
if (p > current_segment_start) {
MDNode* text_node = md_node_new(MD_NODE_TEXT);
text_node->text_content = md_strndup(current_segment_start, p - current_segment_start);
md_node_add_child(parent_node, text_node);
}
inline_node = md_node_new(MD_NODE_LINK);
const char* link_content_ptr = url_s;
const char* actual_url_end = url_e_search;
// Title parsing logic (same as for image)
const char* temp_title_search = url_s;
while(temp_title_search < url_e_search) {
if (isspace((unsigned char)*temp_title_search) && (temp_title_search + 1) < url_e_search) {
char quote_char = *(temp_title_search + 1);
if (quote_char == '"' || quote_char == '\'') {
const char* t_start = temp_title_search + 2;
const char* t_end = t_start;
while(t_end < url_e_search && *t_end != quote_char) {
if (*t_end == '\\' && (t_end+1) < url_e_search) t_end++;
t_end++;
}
if (t_end < url_e_search && *t_end == quote_char) {
inline_node->title = md_strndup(t_start, t_end - t_start);
actual_url_end = temp_title_search;
break;
}
}
}
temp_title_search++;
}
char* temp_url = md_strndup(link_content_ptr, actual_url_end - link_content_ptr);
inline_node->url = md_strdup(trim_trailing_whitespace(temp_url));
free(temp_url);
md_node_add_child(parent_node, inline_node);
parse_inlines_recursive(inline_node, text_s, text_e); // Parse link text
next_p = url_e_search + 1;
}
}
}
// Strong (**, __)
else if ((*p == '*' && (p + 1) < text_end && *(p + 1) == '*') ||
(*p == '_' && (p + 1) < text_end && *(p + 1) == '_')) {
char marker[3]; marker[0] = *p; marker[1] = *p; marker[2] = '\0';
const char* end_marker = find_matching_delimiter(p, text_end, marker, 2);
if (end_marker) {
if (p > current_segment_start) {
MDNode* text_node = md_node_new(MD_NODE_TEXT);
text_node->text_content = md_strndup(current_segment_start, p - current_segment_start);
md_node_add_child(parent_node, text_node);
}
inline_node = md_node_new(MD_NODE_STRONG);
md_node_add_child(parent_node, inline_node);
parse_inlines_recursive(inline_node, p + 2, end_marker);
next_p = end_marker + 2;
}
}
// Emphasis (*, _)
else if (*p == '*' || *p == '_') {
char marker[2]; marker[0] = *p; marker[1] = '\0';
// Basic emphasis, not considering complex CommonMark intra-word rules for '_'
const char* end_marker = find_matching_delimiter(p, text_end, marker, 1);
if (end_marker && end_marker > p + 1) { // Not empty, e.g. **, __ or *p
// Avoid triggering on internal underscores in words if marker is '_' (simplified)
bool allow = true;
if (*p == '_' && end_marker + 1 < text_end && isalnum((unsigned char)*(end_marker+1)) && isalnum((unsigned char)*(p-1))) {
// allow = false; // crude attempt to prevent word_emphasis_word
}
if(allow) {
if (p > current_segment_start) {
MDNode* text_node = md_node_new(MD_NODE_TEXT);
text_node->text_content = md_strndup(current_segment_start, p - current_segment_start);
md_node_add_child(parent_node, text_node);
}
inline_node = md_node_new(MD_NODE_EMPHASIS);
md_node_add_child(parent_node, inline_node);
parse_inlines_recursive(inline_node, p + 1, end_marker);
next_p = end_marker + 1;
}
}
}
// Strikethrough (~~)
else if (*p == '~' && (p + 1) < text_end && *(p + 1) == '~') {
const char* end_marker = find_matching_delimiter(p, text_end, "~~", 2);
if (end_marker) {
if (p > current_segment_start) {
MDNode* text_node = md_node_new(MD_NODE_TEXT);
text_node->text_content = md_strndup(current_segment_start, p - current_segment_start);
md_node_add_child(parent_node, text_node);
}
inline_node = md_node_new(MD_NODE_STRIKETHROUGH);
md_node_add_child(parent_node, inline_node);
parse_inlines_recursive(inline_node, p + 2, end_marker);
next_p = end_marker + 2;
}
}
// Inline Code (`) - simplified, doesn't handle `` code ``, etc.
else if (*p == '`') {
const char* end_marker = find_next_unescaped(p + 1, text_end, '`');
if (end_marker) {
if (p > current_segment_start) {
MDNode* text_node = md_node_new(MD_NODE_TEXT);
text_node->text_content = md_strndup(current_segment_start, p - current_segment_start);
md_node_add_child(parent_node, text_node);
}
inline_node = md_node_new(MD_NODE_INLINE_CODE);
const char* code_s = p + 1;
const char* code_e = end_marker;
size_t code_len = code_e - code_s;
// Trim one leading/trailing space if content is not all spaces and starts/ends with space
if (code_len >= 2 && *code_s == ' ' && *(code_e - 1) == ' ') {
bool all_spaces = true;
for(size_t i=0; i<code_len; ++i) if (code_s[i] != ' ') { all_spaces = false; break;}
if (!all_spaces) {
code_s++; code_len -=2;
}
}
inline_node->text_content = md_strndup(code_s, code_len);
md_node_add_child(parent_node, inline_node);
next_p = end_marker + 1;
}
}
// Autolinks <http://...> or <mailto:...> and basic HTML tags
else if (*p == '<') {
bool is_autolink_uri = false;
if ((text_end - (p+1)) >= 7 && (strncmp(p+1, "http://", 7) == 0 || strncmp(p+1, "https://", 8) == 0 || strncmp(p+1, "mailto:", 7) == 0)) {
is_autolink_uri = true;
} else if ((text_end - (p+1)) >= 6 && strncmp(p+1, "ftp://", 6) == 0) {
is_autolink_uri = true;
}
if (is_autolink_uri) {
const char* end_autolink = find_next_unescaped(p + 1, text_end, '>');
if (end_autolink) {
if (p > current_segment_start) {
MDNode* text_node = md_node_new(MD_NODE_TEXT);
text_node->text_content = md_strndup(current_segment_start, p - current_segment_start);
md_node_add_child(parent_node, text_node);
}
inline_node = md_node_new(MD_NODE_LINK);
inline_node->url = md_strndup(p + 1, end_autolink - (p + 1));
MDNode* text_child = md_node_new(MD_NODE_TEXT); // Autolink text is its URL
text_child->text_content = md_strdup(inline_node->url);
md_node_add_child(inline_node, text_child);
md_node_add_child(parent_node, inline_node);
next_p = end_autolink + 1;
}
} else if ((p+1) < text_end && (isalpha((unsigned char)*(p+1)) || *(p+1) == '/' || *(p+1) == '!')) { // Basic HTML tag start
const char* end_tag = strchr(p + 1, '>'); // Simple search for closing >
if (end_tag && end_tag < text_end) {
if (p > current_segment_start) {
MDNode* text_node = md_node_new(MD_NODE_TEXT);
text_node->text_content = md_strndup(current_segment_start, p - current_segment_start);
md_node_add_child(parent_node, text_node);
}
inline_node = md_node_new(MD_NODE_HTML_INLINE);
inline_node->text_content = md_strndup(p, (end_tag - p) + 1);
md_node_add_child(parent_node, inline_node);
next_p = end_tag + 1;
}
}
}
if (inline_node) { // An inline element was parsed
p = next_p;
current_segment_start = p;
} else { // No inline element started at *p, advance
p++;
}
}
// Add any remaining text as a plain text node
if (p > current_segment_start) {
MDNode* text_node = md_node_new(MD_NODE_TEXT);
text_node->text_content = md_strndup(current_segment_start, p - current_segment_start);
md_node_add_child(parent_node, text_node);
}
}
// Parses all inline markup of the NUL-terminated string text into children of
// parent_block_node. A NULL or empty text adds nothing.
void parse_inlines(MDNode* parent_block_node, const char* text) {
    if (text == NULL || text[0] == '\0') {
        return;
    }
    const char* text_stop = text + strlen(text);
    parse_inlines_recursive(parent_block_node, text, text_stop);
}
// --- Block Parsing ---
// Decides whether line starts a list item.
//
// After optional leading spaces the line must begin with either a bullet
// ('*', '-' or '+') or 1-9 decimal digits followed by '.' or ')'. The marker
// must be followed by a space, a tab, '\n' or the end of the string.
//
// Returns 0 when the line is not a list item. Otherwise returns the bullet
// character for unordered items, or the delimiter ('.' or ')') for ordered
// items, and fills the outputs (each may be NULL if the caller does not need it):
//   out_indent              number of leading spaces before the marker
//   out_content_start       first character after the marker and the spaces/tabs following it
//   out_number_if_ordered   the item number; always reset to 0 first
//   out_actual_marker_char  same character as the return value
//
// The ordered-list scan always starts at the marker position, so a line such
// as "-1. x" (a bullet not followed by whitespace) is not a list item. Numbers
// are limited to 9 digits, which also guarantees that they fit in an int.
char is_list_item_line(const char* line, int* out_indent, const char** out_content_start, int* out_number_if_ordered, char* out_actual_marker_char) {
    if (out_number_if_ordered) *out_number_if_ordered = 0;
    if (!line) return 0;

    const char* p = line;
    int current_indent = 0;
    while (*p == ' ') {
        p++;
        current_indent++;
    }

    char marker = 0;
    int number = 0;
    const char* after_marker = NULL;

    if (*p == '*' || *p == '-' || *p == '+') {
        // Unordered list: *, -, +
        marker = *p;
        after_marker = p + 1;
    } else if (isdigit((unsigned char)*p)) {
        // Ordered list: 1., 1)
        int digits = 0;
        while (digits < 9 && isdigit((unsigned char)*p)) {
            number = number * 10 + (*p - '0');
            p++;
            digits++;
        }
        // A tenth digit would be sitting here and fails the delimiter test.
        if (*p == '.' || *p == ')') {
            marker = *p;
            after_marker = p + 1;
        }
    }
    if (!after_marker) return 0;

    const char follower = *after_marker;
    if (follower != ' ' && follower != '\t' && follower != '\0' && follower != '\n') {
        return 0; // marker must be followed by whitespace or end of line
    }

    const char* content = after_marker;
    while (*content == ' ' || *content == '\t') {
        content++; // skip every space/tab between marker and content
    }

    if (out_indent) *out_indent = current_indent;
    if (out_content_start) *out_content_start = content;
    if (out_number_if_ordered) *out_number_if_ordered = number;
    if (out_actual_marker_char) *out_actual_marker_char = marker;
    return marker;
}
// Parses an ATX heading ("# Title", up to six '#').
//
// line must already have its leading spaces removed by the caller. Returns
// NULL when line does not start with 1-6 '#' characters followed by a space,
// a tab, '\n' or the end of the string. Otherwise returns a new HEADING node
// with heading_level set and the heading text parsed into inline children.
//
// Trailing spaces/tabs are removed from the text. A closing run of '#' is
// removed only as a whole and only when it is preceded by a space or tab (or
// makes up the entire text), so "foo ##" becomes "foo" while "foo###" and
// "foo \#" are kept unchanged.
MDNode* parse_heading_atx(const char* line) {
    if (!line) return NULL;
    int level = 0;
    const char* p = line;
    while (*p == '#') {
        level++;
        p++;
    }
    if (level == 0 || level > 6) return NULL;
    if (*p != ' ' && *p != '\t' && *p != '\0' && *p != '\n') return NULL; // Must have space or be empty after #s
    while (*p == ' ' || *p == '\t') p++; // Skip spaces after #s

    MDNode* heading_node = md_node_new(MD_NODE_HEADING);
    heading_node->heading_level = level;

    char* temp_content = md_strdup(p);
    if (!temp_content) {
        return heading_node; // out of memory: heading is returned without inline content
    }

    // Work on a length index so an empty string never produces a pointer
    // before the start of the buffer.
    size_t len = strlen(temp_content);
    while (len > 0 && (temp_content[len - 1] == ' ' || temp_content[len - 1] == '\t')) {
        len--;
    }
    size_t before_hashes = len;
    while (before_hashes > 0 && temp_content[before_hashes - 1] == '#') {
        before_hashes--;
    }
    bool has_closing_run = before_hashes < len;
    bool run_is_detached = before_hashes == 0 ||
                           temp_content[before_hashes - 1] == ' ' ||
                           temp_content[before_hashes - 1] == '\t';
    if (has_closing_run && run_is_detached) {
        len = before_hashes;
        while (len > 0 && (temp_content[len - 1] == ' ' || temp_content[len - 1] == '\t')) {
            len--;
        }
    }
    temp_content[len] = '\0';

    parse_inlines(heading_node, temp_content);
    free(temp_content);
    return heading_node;
}
// Parses a fenced code block whose opening fence is lines[*current_line_index].
//
// The opening line must start (after optional whitespace) with at least three
// '`' or three '~'. For backtick fences the info string must not contain a
// backtick. In those failure cases NULL is returned and *current_line_index is
// left unchanged.
//
// On success *current_line_index is advanced past the closing fence (or to
// num_lines when the block is unterminated) and a FENCED_CODE_BLOCK node is
// returned with:
//   code_language  first word of the info string, or NULL when there is none
//   code_content   the content lines joined with '\n' (no trailing newline);
//                  "" when the block has no content lines
// Up to as many leading spaces as the opening fence had are removed from each
// content line. Allocation failures are reported with perror and terminate the
// process, matching md_node_new.
MDNode* parse_fenced_code_block(const char** lines, int num_lines, int* current_line_index) {
    const char* first_line = lines[*current_line_index];
    const char* trimmed_first_line = trim_leading_whitespace(first_line);
    int line_indent = count_leading_spaces(first_line);
    char fence_char = trimmed_first_line[0];
    // Only backticks and tildes open a fence; this also stops the scan below
    // from running past the terminator of an empty line.
    if (fence_char != '`' && fence_char != '~') return NULL;

    int fence_len = 0;
    const char* p = trimmed_first_line;
    while (*p == fence_char) {
        fence_len++;
        p++;
    }
    if (fence_len < 3) return NULL;

    const char* lang_start = p;
    while (*lang_start == ' ' || *lang_start == '\t') lang_start++; // Skip spaces before info string
    char* lang_info = md_strdup(lang_start);
    if (!lang_info) {
        perror("Failed to allocate info string");
        exit(EXIT_FAILURE);
    }
    trim_trailing_whitespace(lang_info);
    if (fence_char == '`' && strchr(lang_info, '`')) { // Info string cannot contain backticks for backtick fences
        free(lang_info);
        return NULL;
    }

    MDNode* code_node = md_node_new(MD_NODE_FENCED_CODE_BLOCK);
    if (strlen(lang_info) > 0) {
        char* first_word_end = lang_info;
        while (*first_word_end && !isspace((unsigned char)*first_word_end)) first_word_end++;
        *first_word_end = '\0'; // Take only first word as language
        code_node->code_language = md_strdup(lang_info);
    }
    free(lang_info);
    (*current_line_index)++;

    LineBuffer content_buffer;
    init_line_buffer(&content_buffer);
    while (*current_line_index < num_lines) {
        const char* current_line_orig = lines[*current_line_index];

        // A closing fence may be indented by up to the opening fence's indent.
        const char* q = current_line_orig;
        int skipped = 0;
        while (skipped < line_indent && *q == ' ') {
            q++;
            skipped++;
        }
        int closing_fence_len = 0;
        while (*q == fence_char) {
            closing_fence_len++;
            q++;
        }
        if (closing_fence_len >= fence_len && is_blank_line(q)) {
            (*current_line_index)++;
            break;
        }

        // Remove up to `line_indent` common indent from content lines
        int content_line_indent = count_leading_spaces(current_line_orig);
        int effective_indent_to_remove = (content_line_indent < line_indent) ? content_line_indent : line_indent;
        add_line_to_buffer(&content_buffer, current_line_orig + effective_indent_to_remove);
        (*current_line_index)++;
    }

    if (content_buffer.count > 0) {
        // Each line needs its length plus one byte: a '\n' separator, or the
        // final terminator for the last line.
        size_t total_len = 0;
        for (int i = 0; i < content_buffer.count; ++i) {
            const char* l = content_buffer.lines[i];
            total_len += (l ? strlen(l) : 0) + 1;
        }
        char* joined = (char*)malloc(total_len);
        if (!joined) {
            perror("Failed to allocate code block content");
            exit(EXIT_FAILURE);
        }
        char* out = joined;
        for (int i = 0; i < content_buffer.count; ++i) {
            const char* l = content_buffer.lines[i];
            size_t n = l ? strlen(l) : 0;
            if (n > 0) memcpy(out, l, n);
            out += n;
            *out++ = (i + 1 < content_buffer.count) ? '\n' : '\0';
        }
        code_node->code_content = joined;
    } else {
        code_node->code_content = md_strdup("");
    }
    free_line_buffer(&content_buffer);
    return code_node;
}
// Parses an indented code block starting at lines[*current_line_index].
//
// Consecutive lines with at least four leading spaces are collected with four
// spaces removed. Blank lines are kept (with up to four leading spaces
// removed) once at least one code line has been collected; a blank line
// before any code line ends the scan immediately. Collection stops at the
// first non-blank line indented by fewer than four spaces. Trailing blank
// lines are dropped from the content but remain consumed.
//
// Always returns a CODE_BLOCK node; code_content is the collected lines joined
// with '\n' (no trailing newline), or "" when nothing was collected.
// *current_line_index is advanced past every consumed line. Allocation failure
// of the content buffer is reported with perror and terminates the process,
// matching md_node_new.
MDNode* parse_code_block_indented(const char** lines, int num_lines, int* current_line_index) {
    MDNode* code_node = md_node_new(MD_NODE_CODE_BLOCK);
    LineBuffer content_buffer;
    init_line_buffer(&content_buffer);

    while (*current_line_index < num_lines) {
        const char* line = lines[*current_line_index];
        int indent = count_leading_spaces(line);
        if (is_blank_line(line)) {
            if (content_buffer.count == 0) {
                break; // Initial blank lines are not part of code block
            }
            // CommonMark: up to 4 spaces of indent are removed from blank lines.
            add_line_to_buffer(&content_buffer, line + (indent < 4 ? indent : 4));
        } else if (indent >= 4) {
            add_line_to_buffer(&content_buffer, line + 4); // Remove 4 spaces of indent
        } else {
            break; // Not indented enough, end of code block
        }
        (*current_line_index)++;
    }

    // Trim trailing blank lines from content_buffer
    while (content_buffer.count > 0) {
        char* last = content_buffer.lines[content_buffer.count - 1];
        if (last && !is_blank_line(last)) break;
        free(last);
        content_buffer.count--;
    }

    if (content_buffer.count > 0) {
        // Each line needs its length plus one byte: a '\n' separator, or the
        // final terminator for the last line.
        size_t total_len = 0;
        for (int i = 0; i < content_buffer.count; ++i) {
            const char* l = content_buffer.lines[i];
            total_len += (l ? strlen(l) : 0) + 1;
        }
        char* joined = (char*)malloc(total_len);
        if (!joined) {
            perror("Failed to allocate code block content");
            exit(EXIT_FAILURE);
        }
        char* out = joined;
        for (int i = 0; i < content_buffer.count; ++i) {
            const char* l = content_buffer.lines[i];
            size_t n = l ? strlen(l) : 0;
            if (n > 0) memcpy(out, l, n);
            out += n;
            *out++ = (i + 1 < content_buffer.count) ? '\n' : '\0';
        }
        code_node->code_content = joined;
    } else {
        // No actual content lines, possibly just blank lines that were trimmed.
        code_node->code_content = md_strdup("");
    }
    free_line_buffer(&content_buffer);
    return code_node;
}
// Parses a block quote starting at lines[*current_line_index].
//
// Every line whose first non-whitespace character is '>' belongs to the quote;
// the '>' and one optional following space/tab are stripped. Once the quote
// has started, any non-blank line without '>' is also taken (lazy
// continuation). A blank line, or a first line without '>', ends the scan.
// The collected lines are parsed recursively with parse_blocks into the
// BLOCKQUOTE node.
//
// Returns NULL, with *current_line_index unchanged, only when no line was
// collected. If lines were consumed, a BLOCKQUOTE node is always returned,
// even when its content produced no child blocks (e.g. a lone ">").
MDNode* parse_blockquote(const char** lines, int num_lines, int* current_line_index) {
    MDNode* bq_node = md_node_new(MD_NODE_BLOCKQUOTE);
    LineBuffer bq_content_lines;
    init_line_buffer(&bq_content_lines);
    bool first_line_in_bq = true;

    while (*current_line_index < num_lines) {
        const char* line = lines[*current_line_index];
        const char* p = trim_leading_whitespace(line); // Remove leading spaces on the line itself first
        if (*p == '>') {
            p++; // Skip '>'
            if (*p == ' ' || *p == '\t') p++; // Skip optional one space/tab after '>'
            add_line_to_buffer(&bq_content_lines, p);
            (*current_line_index)++;
            first_line_in_bq = false;
        } else if (!first_line_in_bq && !is_blank_line(line)) {
            // Lazy continuation: line does not start with '>', but is not blank.
            add_line_to_buffer(&bq_content_lines, line); // Pass full line for recursive parsing
            (*current_line_index)++;
        } else {
            // Blank line, or line not starting with '>' (and not lazy continuation). End of blockquote.
            break;
        }
    }

    // Remember how many lines were collected: free_line_buffer resets the count.
    const int collected_lines = bq_content_lines.count;
    if (collected_lines > 0) {
        int temp_idx_bq = 0; // Index for recursive call within blockquote content
        parse_blocks((const char**)bq_content_lines.lines, collected_lines, &temp_idx_bq, bq_node);
    }
    free_line_buffer(&bq_content_lines);

    if (collected_lines == 0) { // Nothing was added
        md_node_free(bq_node);
        return NULL;
    }
    return bq_node;
}
// Parses a list whose first item is lines[*current_line_index].
//
// The first line decides the list type: a '.' or ')' marker gives an
// ORDERED_LIST (list_start_number = that item's number), any bullet gives an
// UNORDERED_LIST. list_item_marker stores the first item's marker character.
//
// Items are added while the next candidate line is a list item of the same
// type (ordered vs. unordered) indented by at least initial_list_indent.
// Blank lines between two such items are skipped and make the list loose
// (tight_list = false); blank lines that are not followed by a further item
// of this list are left unconsumed for the caller.
//
// An item's content is the text after its marker plus following lines that
// are indented at least to the item's content column (that indent is
// stripped). One blank line inside an item is kept when the line after it is
// indented to the content column and is not itself a list marker. The content
// is parsed recursively with parse_blocks. An item with an internal blank line
// or with more than one child block also makes the list loose.
//
// Limitation kept from the original logic: any marker line indented by at
// least initial_list_indent ends the current item, so a marker indented deeper
// than its predecessor becomes a sibling item rather than a nested list.
//
// Returns the list node, or NULL (with *current_line_index unchanged) when the
// first line is not a list item that belongs to this list.
MDNode* parse_list(const char** lines, int num_lines, int* current_line_index, int initial_list_indent) {
    if (*current_line_index >= num_lines) return NULL;

    int first_indent = 0, first_number = 0;
    const char* first_content = NULL;
    char first_actual_marker = 0;
    char first_marker = is_list_item_line(lines[*current_line_index], &first_indent, &first_content, &first_number, &first_actual_marker);
    if (!first_marker) return NULL; // outputs are only meaningful for a real item

    const bool ordered = (first_marker == '.' || first_marker == ')');
    MDNode* list_node = md_node_new(ordered ? MD_NODE_ORDERED_LIST : MD_NODE_UNORDERED_LIST);
    if (ordered) {
        list_node->list_start_number = first_number;
    }
    list_node->list_item_marker = first_actual_marker; // '*', '-', '+' for UL; '.' or ')' for OL
    list_node->tight_list = true; // Assume tight, set to false if blank lines appear appropriately

    bool have_item = false;
    while (*current_line_index < num_lines) {
        // Look past blank lines that separate this candidate from the previous
        // item, without consuming them until the candidate is accepted.
        int item_line_idx = *current_line_index;
        bool blank_between_items = false;
        if (have_item) {
            while (item_line_idx < num_lines && is_blank_line(lines[item_line_idx])) {
                item_line_idx++;
                blank_between_items = true;
            }
            if (item_line_idx >= num_lines) break;
        }

        const char* item_line = lines[item_line_idx];
        int item_indent = 0, item_number = 0;
        const char* item_content = NULL;
        char item_actual_marker = 0;
        char item_marker = is_list_item_line(item_line, &item_indent, &item_content, &item_number, &item_actual_marker);
        if (!item_marker || item_indent < initial_list_indent) {
            break; // Not a list item for this list, or indented less (ends current list)
        }
        const bool item_is_ordered = (item_marker == '.' || item_marker == ')');
        if (item_is_ordered != ordered) {
            break; // Type changed, so new list.
        }

        if (blank_between_items) {
            list_node->tight_list = false;
        }
        *current_line_index = item_line_idx + 1; // consume skipped blanks and the marker line

        MDNode* item_node = md_node_new(MD_NODE_LIST_ITEM);
        item_node->list_item_marker = item_actual_marker;

        LineBuffer item_content_lines_buf;
        init_line_buffer(&item_content_lines_buf);
        add_line_to_buffer(&item_content_lines_buf, item_content); // First line of item content

        // Column of the first content character; continuation lines must be
        // indented at least this far.
        const int content_column = (int)(item_content - item_line);
        bool item_has_internal_blank = false;

        while (*current_line_index < num_lines) {
            const char* next_line_str = lines[*current_line_index];
            int probe_indent = 0, probe_number = 0;
            const char* probe_content = NULL;
            char probe_marker = 0;
            if (is_list_item_line(next_line_str, &probe_indent, &probe_content, &probe_number, &probe_marker) &&
                probe_indent >= initial_list_indent) {
                break; // New sibling item starts, current item ends.
            }
            if (is_blank_line(next_line_str)) {
                if (*current_line_index + 1 >= num_lines) {
                    break; // Blank line at EOF
                }
                const char* after_blank_line = lines[*current_line_index + 1];
                bool after_is_item = is_list_item_line(after_blank_line, &probe_indent, &probe_content, &probe_number, &probe_marker) != 0;
                if (count_leading_spaces(after_blank_line) < content_column || after_is_item) {
                    break; // Blank line not part of this item
                }
                add_line_to_buffer(&item_content_lines_buf, ""); // Add the blank line
                item_has_internal_blank = true;
            } else {
                if (count_leading_spaces(next_line_str) < content_column) {
                    break; // Not indented enough to be part of this item.
                }
                // Properly indented continuation line: strip the item's indent.
                add_line_to_buffer(&item_content_lines_buf, next_line_str + content_column);
            }
            (*current_line_index)++;
        }

        // Parse collected content for this item
        int temp_item_idx = 0;
        parse_blocks((const char**)item_content_lines_buf.lines, item_content_lines_buf.count, &temp_item_idx, item_node);
        free_line_buffer(&item_content_lines_buf);

        if (item_has_internal_blank || item_node->children_count > 1) {
            list_node->tight_list = false;
        }
        md_node_add_child(list_node, item_node);
        have_item = true;
    }

    if (list_node->children_count == 0) { // No items added
        md_node_free(list_node);
        return NULL;
    }
    return list_node;
}
/*
 * Recognises a horizontal rule line.
 *
 * `line` must already have its leading whitespace removed by the caller.
 * The line qualifies when its first character is '*', '-' or '_', every
 * other character is either that same marker or whitespace, and the marker
 * occurs at least three times.
 *
 * Returns a freshly created MD_NODE_HORIZONTAL_RULE node on success (the
 * caller takes ownership), or NULL when the line is not a rule.
 */
MDNode* parse_horizontal_rule(const char* line) {
    const char marker = line[0];
    switch (marker) {
        case '*':
        case '-':
        case '_':
            break;
        default:
            return NULL;
    }

    int marker_count = 0;
    for (const char* cursor = line; *cursor != '\0'; ++cursor) {
        if (*cursor == marker) {
            marker_count++;
            continue;
        }
        // Anything other than the marker or whitespace disqualifies the line.
        if (!isspace((unsigned char)*cursor)) {
            return NULL;
        }
    }

    return (marker_count >= 3) ? md_node_new(MD_NODE_HORIZONTAL_RULE) : NULL;
}
MDNode* parse_blocks(const char** lines, int num_lines, int* current_line_index, MDNode* parent_container) {
LineBuffer paragraph_buffer;
init_line_buffer(&paragraph_buffer);
while (*current_line_index < num_lines) {
const char* line_orig = lines[*current_line_index];
const char* line_trimmed_leading = trim_leading_whitespace(line_orig);
int initial_indent_spaces = count_leading_spaces(line_orig);
// 1. Try Setext heading if paragraph_buffer has content
if (paragraph_buffer.count > 0) {
int setext_level = 0;
const char* p_setext = line_trimmed_leading;
char setext_char_test = *p_setext;
if (setext_char_test == '=' || setext_char_test == '-') {
while (*p_setext == setext_char_test) p_setext++;
if (is_blank_line(p_setext)) { // Line consists only of '=' or '-'
bool is_hr_candidate = false;
if (setext_char_test == '-') {
MDNode* hr_test = parse_horizontal_rule(line_trimmed_leading);
if (hr_test) { is_hr_candidate = true; md_node_free(hr_test); }
}
if (!is_hr_candidate || setext_char_test == '=') { // '=' underlines are not HRs
setext_level = (setext_char_test == '=') ? 1 : 2;
}
}
}
if (setext_level > 0) {
MDNode* heading_node = md_node_new(MD_NODE_HEADING);
heading_node->heading_level = setext_level;
size_t total_para_len = 0;
for (int i = 0; i < paragraph_buffer.count; ++i) total_para_len += strlen(paragraph_buffer.lines[i]) + 1;
char* full_text = (char*)malloc(total_para_len + 1); // +1 for null terminator
if(full_text) {
full_text[0] = '\0';
for (int i = 0; i < paragraph_buffer.count; ++i) {
strcat(full_text, paragraph_buffer.lines[i]);
if (i < paragraph_buffer.count - 1) strcat(full_text, " "); // Join with space
}
parse_inlines(heading_node, full_text);
free(full_text);
}
md_node_add_child(parent_container, heading_node);
free_line_buffer(&paragraph_buffer);
(*current_line_index)++; // Consume the setext marker line
continue;
}
}
// 2. Flush paragraph if current line starts a new block type or is blank
bool line_starts_new_non_para_block = false;
if (*line_trimmed_leading == '#') line_starts_new_non_para_block = true; // ATX Heading
else if (strncmp(line_trimmed_leading, "```", 3) == 0 || strncmp(line_trimmed_leading, "~~~", 3) == 0) line_starts_new_non_para_block = true; // Fenced Code
else if (parse_horizontal_rule(line_trimmed_leading) != NULL) { // HR
MDNode* hr_test = parse_horizontal_rule(line_trimmed_leading); // Need to free this test node
if (hr_test) { line_starts_new_non_para_block = true; md_node_free(hr_test); }
}
else if (*line_trimmed_leading == '>') line_starts_new_non_para_block = true; // Blockquote
else if (is_list_item_line(line_orig, &(int){0}, &(const char*){NULL}, &(int){0}, &(char){0})) line_starts_new_non_para_block = true; // List item
else if (paragraph_buffer.count == 0 && initial_indent_spaces >= 4 && !is_blank_line(line_orig)) line_starts_new_non_para_block = true; // Indented code, if para empty
if (paragraph_buffer.count > 0 && (line_starts_new_non_para_block || is_blank_line(line_orig))) {
MDNode* para_node = md_node_new(MD_NODE_PARAGRAPH);
for (int i = 0; i < paragraph_buffer.count; ++i) {
char* current_para_line = paragraph_buffer.lines[i]; // This is already trimmed leading
size_t len = strlen(current_para_line);
bool hard_break = (len >= 2 && current_para_line[len-1] == ' ' && current_para_line[len-2] == ' ');
char* text_to_parse_inlines = md_strdup(current_para_line);
if (hard_break) text_to_parse_inlines[len-2] = '\0'; // Remove trailing spaces for inline parsing
parse_inlines(para_node, text_to_parse_inlines);
free(text_to_parse_inlines);
if (hard_break) {
md_node_add_child(para_node, md_node_new(MD_NODE_HARD_BREAK));
} else if (i < paragraph_buffer.count - 1) {
md_node_add_child(para_node, md_node_new(MD_NODE_SOFT_BREAK));
}
}
if(para_node->children_count > 0) md_node_add_child(parent_container, para_node);
else md_node_free(para_node); // Empty paragraph
free_line_buffer(&paragraph_buffer);
}
// 3. Process current line based on its type
if (is_blank_line(line_orig)) {
(*current_line_index)++;
continue;
}
MDNode* new_block_node = NULL;
if (*line_trimmed_leading == '#') { // ATX Heading
new_block_node = parse_heading_atx(line_trimmed_leading);
if (new_block_node) (*current_line_index)++;
} else if (strncmp(line_trimmed_leading, "```", 3) == 0 || strncmp(line_trimmed_leading, "~~~", 3) == 0) { // Fenced Code
new_block_node = parse_fenced_code_block(lines, num_lines, current_line_index); // Advances index internally
} else {
MDNode* hr_try = parse_horizontal_rule(line_trimmed_leading); // HR
if (hr_try) {
new_block_node = hr_try;
(*current_line_index)++;
}
}
if (!new_block_node && *line_trimmed_leading == '>') { // Blockquote
new_block_node = parse_blockquote(lines, num_lines, current_line_index); // Advances index internally
}
if (!new_block_node && is_list_item_line(line_orig, &(int){0}, &(const char*){NULL}, &(int){0}, &(char){0})) { // List item
new_block_node = parse_list(lines, num_lines, current_line_index, initial_indent_spaces); // Advances index internally
}
if (!new_block_node && initial_indent_spaces >= 4 && paragraph_buffer.count == 0) { // Indented Code Block (only if not continuing a paragraph)
new_block_node = parse_code_block_indented(lines, num_lines, current_line_index); // Advances index internally
}
if (new_block_node) {
md_node_add_child(parent_container, new_block_node);
// Index already advanced by specific parsers or above
continue;
}
// 4. If nothing else, it's a paragraph line
// Line content for paragraph is trimmed of leading/trailing whitespace common to the block,
// but internal structure (like trailing spaces for hard break) is preserved.
char* para_line_content = md_strdup(line_trimmed_leading);
// Don't trim_trailing_whitespace here, as it might remove " " for hard break.
add_line_to_buffer(&paragraph_buffer, para_line_content);
free(para_line_content);
(*current_line_index)++;
}
// Final flush of paragraph_buffer
if (paragraph_buffer.count > 0) {
MDNode* para_node = md_node_new(MD_NODE_PARAGRAPH);
for (int i = 0; i < paragraph_buffer.count; ++i) {
char* current_para_line = paragraph_buffer.lines[i];
size_t len = strlen(current_para_line);
bool hard_break = (len >= 2 && current_para_line[len-1] == ' ' && current_para_line[len-2] == ' ');
char* text_to_parse_inlines = md_strdup(current_para_line);
if (hard_break) text_to_parse_inlines[len-2] = '\0';
parse_inlines(para_node, text_to_parse_inlines);
free(text_to_parse_inlines);
if (hard_break) {
md_node_add_child(para_node, md_node_new(MD_NODE_HARD_BREAK));
} else if (i < paragraph_buffer.count - 1) {
md_node_add_child(para_node, md_node_new(MD_NODE_SOFT_BREAK));
}
}
if(para_node->children_count > 0) md_node_add_child(parent_container, para_node);
else md_node_free(para_node);
free_line_buffer(&paragraph_buffer);
}
return parent_container;
}
/*
 * Builds a DOCUMENT node from an array of `num_lines` already-split lines.
 *
 * Returns the new document node (the caller releases it with md_node_free),
 * or NULL when the root node cannot be created. A NULL or empty line array
 * yields an empty document.
 */
MDNode* parse_document_from_lines(const char** lines, int num_lines) {
    MDNode* doc = md_node_new(MD_NODE_DOCUMENT);
    if (!doc) return NULL;
    if (lines && num_lines > 0) {
        int current_line_idx = 0;
        parse_blocks(lines, num_lines, &current_line_idx, doc);
    }
    return doc;
}
/*
 * Parses a NUL-terminated Markdown string into a document tree.
 *
 * The text is split on '\n'; a '\r' directly before the line end is dropped
 * so CRLF input behaves like LF input. Tabs in every line are expanded with
 * expand_tabs before block parsing. Text after the final '\n' (possibly
 * empty) is passed on as a last line.
 *
 * Returns the document node (release with md_node_free), or NULL when
 * `markdown_text` is NULL or the document node cannot be created.
 */
MDNode* parse_markdown(const char* markdown_text) {
    if (!markdown_text) return NULL;

    LineBuffer line_buf;
    init_line_buffer(&line_buf);

    const char* line_start = markdown_text;
    for (;;) {
        const char* line_end = strchr(line_start, '\n');
        size_t current_line_len = line_end ? (size_t)(line_end - line_start) : strlen(line_start);
        // Normalise CRLF: a trailing '\r' would otherwise hide the two
        // trailing spaces that mark a hard break.
        if (current_line_len > 0 && line_start[current_line_len - 1] == '\r') current_line_len--;

        char* segment = md_strndup(line_start, current_line_len);
        if (segment) { // a line whose copy cannot be allocated is skipped
            char* expanded_segment = expand_tabs(segment);
            // Fall back to the unexpanded text if expand_tabs fails.
            add_line_to_buffer(&line_buf, expanded_segment ? expanded_segment : segment);
            free(expanded_segment);
            free(segment);
        }

        if (!line_end) break;
        line_start = line_end + 1;
    }

    MDNode* doc = parse_document_from_lines((const char**)line_buf.lines, line_buf.count);
    free_line_buffer(&line_buf);
    return doc;
}
// --- AST Printer for Demo/Debug ---
void print_ast_node(MDNode* node, int indent_level) {
if (!node) return;
for (int i = 0; i < indent_level; ++i) printf(" ");
switch (node->type) {
case MD_NODE_DOCUMENT: printf("DOCUMENT\n"); break;
case MD_NODE_PARAGRAPH: printf("PARAGRAPH\n"); break;
case MD_NODE_HEADING: printf("HEADING (Level %d)\n", node->heading_level); break;
case MD_NODE_BLOCKQUOTE: printf("BLOCKQUOTE\n"); break;
case MD_NODE_UNORDERED_LIST: printf("UNORDERED_LIST (Marker: '%c', Tight: %s)\n", node->list_item_marker, node->tight_list ? "yes" : "no"); break;
case MD_NODE_ORDERED_LIST: printf("ORDERED_LIST (Start: %d, Marker: '%c', Tight: %s)\n", node->list_start_number, node->list_item_marker, node->tight_list ? "yes" : "no"); break;
case MD_NODE_LIST_ITEM: printf("LIST_ITEM (Marker: '%c')\n", node->list_item_marker); break;
case MD_NODE_CODE_BLOCK: printf("CODE_BLOCK (Indented)\n");
if(node->code_content && strlen(node->code_content) < 60) {
for (int i = 0; i < indent_level + 1; ++i) printf(" "); printf("Content: [[%s]]\n", node->code_content);
} else if (node->code_content) {
for (int i = 0; i < indent_level + 1; ++i) printf(" "); printf("Content: (long)\n");
} break;
case MD_NODE_FENCED_CODE_BLOCK: printf("FENCED_CODE_BLOCK (Lang: %s)\n", node->code_language ? node->code_language : "(none)");
if(node->code_content && strlen(node->code_content) < 60) {
for (int i = 0; i < indent_level + 1; ++i) printf(" "); printf("Content: [[%s]]\n", node->code_content);
} else if (node->code_content) {
for (int i = 0; i < indent_level + 1; ++i) printf(" "); printf("Content: (long)\n");
} break;
case MD_NODE_HORIZONTAL_RULE: printf("HORIZONTAL_RULE\n"); break;
case MD_NODE_HTML_BLOCK: printf("HTML_BLOCK: [[%s]]\n", node->text_content ? node->text_content : ""); break;
case MD_NODE_TEXT: printf("TEXT: \"%s\"\n", node->text_content ? node->text_content : ""); break;
case MD_NODE_EMPHASIS: printf("EMPHASIS\n"); break;
case MD_NODE_STRONG: printf("STRONG\n"); break;
case MD_NODE_STRIKETHROUGH: printf("STRIKETHROUGH\n"); break;
case MD_NODE_INLINE_CODE: printf("INLINE_CODE: `\"%s\"`\n", node->text_content ? node->text_content : ""); break;
case MD_NODE_LINK: printf("LINK (URL: %s, Title: %s)\n", node->url ? node->url : "", node->title ? node->title : ""); break;
case MD_NODE_IMAGE: printf("IMAGE (Alt: %s, URL: %s, Title: %s)\n", node->alt_text ? node->alt_text : "", node->url ? node->url : "", node->title ? node->title : ""); break;
case MD_NODE_SOFT_BREAK: printf("SOFT_BREAK\n"); break;
case MD_NODE_HARD_BREAK: printf("HARD_BREAK\n"); break;
case MD_NODE_HTML_INLINE: printf("HTML_INLINE: %s\n", node->text_content ? node->text_content : ""); break;
case MD_NODE_ESCAPED_CHAR: printf("ESCAPED_CHAR: \\%s\n", node->text_content ? node->text_content : ""); break;
default: printf("UNKNOWN_NODE (%d)\n", node->type); break;
}
for (size_t i = 0; i < node->children_count; ++i) {
print_ast_node(node->children[i], indent_level + 1);
}
}
// --- Main Function (Example Usage) ---
/*
 * Demo driver: parses a built-in Markdown sample, prints the sample and the
 * resulting tree to stdout, and releases the tree.
 *
 * Returns 0 on success and EXIT_FAILURE when no document could be produced.
 */
int main(void) {
    const char* markdown_example =
        "# Welcome to Markdown\n\n"
        "This is a paragraph with *italic* and **bold** text.\n"
        // Two trailing spaces are what the block parser treats as a hard break.
        "And a line with two spaces at the end for a hard break.  \n"
        "Next line.\n\n"
        "Another paragraph with `inline code` and a [link](http://example.com \"Optional Title\").\n"
        "An image: ![Alt text](/path/to/image.jpg \"Image Title\")\n\n"
        "> This is a blockquote.\n"
        "> With multiple lines.\n\n"
        "And a lazy continuation\nfor the blockquote.\n\n"
        "## Sub Heading\n\n"
        "Setext L1\n"
        "=========\n\n"
        "Setext L2\n"
        "---------\n\n"
        "Indented code block:\n\n"
        // At least four leading spaces are required for an indented code block.
        "    int main() {\n"
        "        printf(\"Hello\");\n"
        "    }\n\n"
        "Fenced code block:\n"
        "```c\n"
        "void func() {\n"
        " // comment\n"
        "}\n"
        "```\n\n"
        "Unordered List (tight):\n"
        "- Item 1\n"
        " - Nested Item 1.1\n"
        " - Nested Item 1.2\n"
        "- Item 2\n"
        " * With more indent\n\n"
        "Ordered List (tight):\n"
        "1. First\n"
        "2. Second\n"
        " 1) Nested first (marker change)\n"
        " 2) Nested second\n"
        "3. Third\n\n"
        "* Loose list item 1\n\n"
        "* Loose list item 2\n"
        " This is content for item 2.\n\n"
        " Still item 2, after blank line.\n\n"
        "* Loose list item 3\n\n"
        "Horizontal Rule:\n"
        "---\n\n"
        "Escaped chars: \\*hello\\* \\`code\\` \\[link\\]\n"
        "Inline HTML: <custom-tag attr=\"val\">content</custom-tag> also <br/> this is text.\n"
        "Autolink: <http://google.com>\n"
        "Not an autolink: <[email protected]> (mailto: is required)\n"
        "Autolink mail: <mailto:[email protected]>\n";

    printf("Parsing Markdown:\n%s\n", markdown_example);

    MDNode* doc = parse_markdown(markdown_example);
    if (!doc) {
        printf("Failed to parse document.\n");
        return EXIT_FAILURE;
    }

    printf("\nAST Structure:\n");
    print_ast_node(doc, 0);
    md_node_free(doc);
    return 0;
}
// This is a translation of md.c
const std = @import("std");
const Allocator = std.mem.Allocator;
const ArrayList = std.ArrayList;
const print = std.debug.print;
// Mirrors the INITIAL_CHILDREN_CAPACITY constant of the C version (md.c).
const INITIAL_CHILDREN_CAPACITY = 4;
// Width of a tab stop, in columns, used by expandTabs when replacing tabs with spaces.
const TAB_STOP_WIDTH = 4;
// --- Enums and Structs ---
/// Kind of a node in the parsed Markdown tree.
/// Block-level kinds are listed first, followed by the inline kinds.
const MdNodeType = enum {
Document,
Paragraph,
Heading,
Blockquote,
UnorderedList,
OrderedList,
ListItem,
CodeBlock, // Indented code block
FencedCodeBlock,
HorizontalRule,
HtmlBlock, // Basic support
// Inline elements
Text,
Emphasis,
Strong,
Strikethrough,
InlineCode,
Link,
Image,
SoftBreak,
HardBreak,
HtmlInline, // Basic support
EscapedChar,
};
/// One node of the Markdown tree.
///
/// A node is heap-allocated through `create` and remembers the allocator it
/// came from. Every optional string field that is set, and every child, is
/// released by `deinit`.
const MdNode = struct {
    allocator: Allocator,
    node_type: MdNodeType,
    text_content: ?[]const u8 = null, // For TEXT, INLINE_CODE, HTML_INLINE/BLOCK, ESCAPED_CHAR
    code_language: ?[]const u8 = null, // For FENCED_CODE_BLOCK
    code_content: ?[]const u8 = null, // For CODE_BLOCK, FENCED_CODE_BLOCK
    heading_level: u8 = 0, // For HEADING (1-6)
    url: ?[]const u8 = null, // For LINK, IMAGE
    title: ?[]const u8 = null, // For LINK, IMAGE (optional)
    alt_text: ?[]const u8 = null, // For IMAGE
    list_item_marker: u8 = 0, // For LIST_ITEM ('*', '-', '+', '.', ')')
    list_start_number: usize = 0, // For ORDERED_LIST
    tight_list: bool = false, // For lists
    parent: ?*MdNode = null, // Optional
    children: ArrayList(*MdNode),

    /// Allocates a node of `node_type` with no children and all optional
    /// fields at their defaults.
    pub fn create(allocator: Allocator, node_type: MdNodeType) !*MdNode {
        const fresh = try allocator.create(MdNode);
        fresh.* = .{
            .allocator = allocator,
            .node_type = node_type,
            .children = ArrayList(*MdNode).init(allocator),
        };
        return fresh;
    }

    /// Frees the node's strings, recursively frees its children, then frees
    /// the node itself. The pointer must not be used afterwards.
    pub fn deinit(self: *MdNode) void {
        const owned_strings = [_]?[]const u8{
            self.text_content,
            self.code_language,
            self.code_content,
            self.url,
            self.title,
            self.alt_text,
        };
        for (owned_strings) |maybe_text| {
            if (maybe_text) |text| self.allocator.free(text);
        }
        for (self.children.items) |child| {
            child.deinit();
        }
        self.children.deinit();
        self.allocator.destroy(self);
    }

    /// Appends `child` to this node's children and records this node as the
    /// child's parent.
    pub fn addChild(self: *MdNode, child: *MdNode) !void {
        try self.children.append(child);
        child.parent = self;
    }
};
// --- Utility Functions ---
/// Returns a newly allocated copy of `s`; the caller frees it with `allocator`.
fn dupeSlice(allocator: Allocator, s: []const u8) ![]const u8 {
    const copy = try allocator.dupe(u8, s);
    return copy;
}
/// Returns `str` without any leading space, tab, CR or LF characters.
fn trimLeadingWhitespace(str: []const u8) []const u8 {
    var start: usize = 0;
    while (start < str.len and std.mem.indexOfScalar(u8, " \t\r\n", str[start]) != null) {
        start += 1;
    }
    return str[start..];
}
/// Returns `str` without any trailing space, tab, CR or LF characters.
fn trimTrailingWhitespace(str: []const u8) []const u8 {
    var end: usize = str.len;
    while (end > 0 and std.mem.indexOfScalar(u8, " \t\r\n", str[end - 1]) != null) {
        end -= 1;
    }
    return str[0..end];
}
/// Returns `str` with space, tab, CR and LF characters removed from both ends.
fn trimWhitespace(str: []const u8) []const u8 {
    const strip_set = " \t\r\n";
    var begin: usize = 0;
    var end: usize = str.len;
    while (begin < end and std.mem.indexOfScalar(u8, strip_set, str[begin]) != null) {
        begin += 1;
    }
    while (end > begin and std.mem.indexOfScalar(u8, strip_set, str[end - 1]) != null) {
        end -= 1;
    }
    return str[begin..end];
}
/// Counts the ' ' characters at the start of `line`; tabs are not counted.
fn countLeadingSpaces(line: []const u8) usize {
    var width: usize = 0;
    while (width < line.len and line[width] == ' ') {
        width += 1;
    }
    return width;
}
/// True when `line` is empty or every byte of it is ASCII whitespace.
fn isBlankLine(line: []const u8) bool {
    var idx: usize = 0;
    while (idx < line.len) : (idx += 1) {
        if (!std.ascii.isSpace(line[idx])) return false;
    }
    return true;
}
/// Returns a newly allocated copy of `line` in which every tab is replaced by
/// the number of spaces needed to reach the next multiple of TAB_STOP_WIDTH.
/// The caller owns the returned slice.
fn expandTabs(allocator: Allocator, line: []const u8) ![]u8 {
    var expanded = ArrayList(u8).init(allocator);
    errdefer expanded.deinit();
    for (line) |byte| {
        if (byte != '\t') {
            try expanded.append(byte);
            continue;
        }
        // Every output byte occupies one column, so the output length is
        // the current column.
        const column = expanded.items.len;
        const padding = TAB_STOP_WIDTH - (column % TAB_STOP_WIDTH);
        try expanded.appendNTimes(' ', padding);
    }
    return expanded.toOwnedSlice();
}
// --- Line Buffer for Block Parsing ---
/// Growable list of text lines. The buffer owns a private copy of every line
/// added to it and frees those copies in `deinit`.
const LineBuffer = struct {
    allocator: Allocator,
    lines: ArrayList([]const u8), // Owns the slices

    /// Creates an empty buffer that allocates through `allocator`.
    pub fn init(allocator: Allocator) LineBuffer {
        return LineBuffer{
            .allocator = allocator,
            .lines = ArrayList([]const u8).init(allocator),
        };
    }

    /// Frees every stored line and the list itself.
    pub fn deinit(self: *LineBuffer) void {
        for (self.lines.items) |line| {
            self.allocator.free(line);
        }
        self.lines.deinit();
    }

    /// Appends a copy of `line`. On failure the buffer is left unchanged.
    pub fn addLine(self: *LineBuffer, line: []const u8) !void {
        const copy = try self.allocator.dupe(u8, line);
        // Without this the copy would leak when growing the list fails.
        errdefer self.allocator.free(copy);
        try self.lines.append(copy);
    }

    /// Returns the stored lines; the slices stay owned by the buffer.
    pub fn getLines(self: *const LineBuffer) []const []const u8 {
        return self.lines.items;
    }
};
// --- Forward Declarations for Parsers ---
// Using a struct with function pointers to break cyclic dependencies
/// Table of parser entry points that is passed to the parsing functions so
/// that mutually recursive parsers can call each other through it.
///
/// The two inline-parser entries take the table itself as their second
/// argument, matching parseInlinesRecursiveImpl / parseInlinesImpl and the
/// way both are invoked (`funcs.parseInlinesRecursiveFn(allocator, funcs, ...)`).
const ParserFuncs = struct {
    parseInlinesRecursiveFn: fn (allocator: Allocator, funcs: ParserFuncs, parent_node: *MdNode, text_start: []const u8) anyerror!void,
    parseInlinesFn: fn (allocator: Allocator, funcs: ParserFuncs, parent_node: *MdNode, text: []const u8) anyerror!void,
    parseBlocksFn: fn (allocator: Allocator, funcs: ParserFuncs, lines: []const []const u8, current_line_index: *usize, parent_container: *MdNode) anyerror!*MdNode,
    parseHorizontalRuleFn: fn (line_trimmed: []const u8) ?*MdNode,
    isListItemLineFn: fn (line: []const u8) ?ListItemInfo,
};
/// Description of a line that was recognised as the start of a list item.
const ListItemInfo = struct {
indent: usize, // Number of leading spaces before the marker
content_start: []const u8, // Rest of the line after the marker and the spacing that follows it
number_if_ordered: usize, // 0 for bullet items; presumably the item number for ordered items (confirm in isListItemLineImpl)
actual_marker_char: u8, // For bullet items, the marker character as written in the line
marker_type: u8, // '*', '-', '+', '.', ')'
};
// --- Inline Parsing ---
/// Returns the index of the first `char_to_find` in `s` that is not directly
/// preceded by a backslash escaping it, or null when there is none.
fn findNextUnescaped(s: []const u8, char_to_find: u8) ?usize {
    var pos: usize = 0;
    while (pos < s.len) {
        const here = s[pos];
        // A backslash followed by the wanted character hides that occurrence.
        if (here == '\\' and pos + 1 < s.len and s[pos + 1] == char_to_find) {
            pos += 2;
            continue;
        }
        if (here == char_to_find) return pos;
        pos += 1;
    }
    return null;
}
/// Finds the closing occurrence of `marker` in `s`.
///
/// `s` is expected to begin with the opening marker, so the search starts at
/// index `marker.len`. An occurrence preceded by an odd number of
/// backslashes is escaped and skipped. Returns the index of the closing
/// marker within `s`, or null when there is none (including when `s` is too
/// short to hold both an opening and a closing marker).
fn findMatchingDelimiter(s: []const u8, marker: []const u8) ?usize {
    // Also prevents the unsigned underflow of `s.len - marker.len`.
    if (s.len < 2 * marker.len) return null;

    var p_idx: usize = marker.len;
    while (p_idx + marker.len <= s.len) {
        if (std.mem.startsWith(u8, s[p_idx..], marker)) {
            // Count the run of backslashes that ends right before the candidate.
            var backslashes: usize = 0;
            var k: usize = p_idx;
            while (k > 0 and s[k - 1] == '\\') : (k -= 1) {
                backslashes += 1;
            }
            if (backslashes % 2 == 1) { // Odd number of backslashes means marker is escaped
                p_idx += marker.len;
                continue;
            }
            return p_idx;
        }
        p_idx += 1;
    }
    return null;
}
/// URL and optional title found between the parentheses of a link or image.
/// Both slices point into the text that was scanned; nothing is allocated.
const InlineDestination = struct {
    url: []const u8,
    title: ?[]const u8,
};

/// Appends a childless node of `node_type` whose text_content is a copy of `content`.
fn inlineEmitLeaf(allocator: Allocator, parent: *MdNode, node_type: MdNodeType, content: []const u8) !void {
    const node = try MdNode.create(allocator, node_type);
    errdefer node.deinit(); // the node is not owned by `parent` until addChild succeeds
    node.text_content = try dupeSlice(allocator, content);
    try parent.addChild(node);
}

/// Appends `text` as a Text node; does nothing when `text` is empty.
fn inlineEmitText(allocator: Allocator, parent: *MdNode, text: []const u8) !void {
    if (text.len == 0) return;
    try inlineEmitLeaf(allocator, parent, .Text, text);
}

/// Returns the index of the ')' that balances an already opened '(' when
/// scanning `text` from `start`, skipping backslash-escaped characters.
fn inlineFindClosingParen(text: []const u8, start: usize) ?usize {
    var balance: i32 = 1;
    var i: usize = start;
    while (i < text.len) {
        if (text[i] == '\\' and i + 1 < text.len) {
            i += 2;
            continue;
        }
        if (text[i] == '(') {
            balance += 1;
        } else if (text[i] == ')') {
            balance -= 1;
            if (balance == 0) return i;
        }
        i += 1;
    }
    return null;
}

/// Splits the text between a link's parentheses into URL and optional title.
/// The title is the first quoted run (' or ") that directly follows a
/// whitespace character and has a matching closing quote; the URL is
/// everything before that whitespace, trimmed.
fn inlineSplitDestination(content: []const u8) InlineDestination {
    var url_end: usize = content.len;
    var title: ?[]const u8 = null;
    var i: usize = 0;
    while (i < content.len) : (i += 1) {
        if (!std.ascii.isSpace(content[i]) or i + 1 >= content.len) continue;
        const quote = content[i + 1];
        if (quote != '"' and quote != '\'') continue;
        const title_start = i + 2;
        var title_end = title_start;
        while (title_end < content.len and content[title_end] != quote) {
            // A backslash makes the scan step over the character after it.
            if (content[title_end] == '\\' and title_end + 1 < content.len) title_end += 1;
            title_end += 1;
        }
        if (title_end < content.len and content[title_end] == quote) {
            title = content[title_start..title_end];
            url_end = i;
            break;
        }
    }
    return InlineDestination{ .url = trimWhitespace(content[0..url_end]), .title = title };
}

/// Stores copies of the title (if any) and URL found in `content` on `node`.
fn inlineAttachDestination(allocator: Allocator, node: *MdNode, content: []const u8) !void {
    const dest = inlineSplitDestination(content);
    if (dest.title) |t| node.title = try dupeSlice(allocator, t);
    node.url = try dupeSlice(allocator, dest.url);
}

/// Backslash escape at rest[0]. Returns the number of bytes consumed, or null.
fn inlineTryEscape(allocator: Allocator, parent: *MdNode, pending: []const u8, rest: []const u8) !?usize {
    if (rest.len < 2 or std.mem.indexOfScalar(u8, "*_`~[]()#+-.<>!", rest[1]) == null) return null;
    try inlineEmitText(allocator, parent, pending);
    try inlineEmitLeaf(allocator, parent, .EscapedChar, rest[1..2]);
    return 2;
}

/// Image `![alt](url "title")` at rest[0]. Returns bytes consumed, or null.
fn inlineTryImage(allocator: Allocator, parent: *MdNode, pending: []const u8, rest: []const u8) !?usize {
    if (rest.len < 2 or rest[1] != '[') return null;
    const alt_start: usize = 2;
    const alt_rel = findNextUnescaped(rest[alt_start..], ']') orelse return null;
    const alt_end = alt_start + alt_rel;
    if (alt_end + 1 >= rest.len or rest[alt_end + 1] != '(') return null;
    const dest_start = alt_end + 2;
    const close = inlineFindClosingParen(rest, dest_start) orelse return null;

    try inlineEmitText(allocator, parent, pending);
    const image = try MdNode.create(allocator, .Image);
    errdefer image.deinit();
    image.alt_text = try dupeSlice(allocator, rest[alt_start..alt_end]);
    try inlineAttachDestination(allocator, image, rest[dest_start..close]);
    try parent.addChild(image);
    return close + 1;
}

/// Link `[text](url "title")` at rest[0]; the link text is parsed recursively
/// into the Link node. Returns bytes consumed, or null.
fn inlineTryLink(allocator: Allocator, funcs: ParserFuncs, parent: *MdNode, pending: []const u8, rest: []const u8) !?usize {
    const label_start: usize = 1;
    const label_rel = findNextUnescaped(rest[label_start..], ']') orelse return null;
    const label_end = label_start + label_rel;
    if (label_end + 1 >= rest.len or rest[label_end + 1] != '(') return null;
    const dest_start = label_end + 2;
    const close = inlineFindClosingParen(rest, dest_start) orelse return null;

    try inlineEmitText(allocator, parent, pending);
    const link = try MdNode.create(allocator, .Link);
    {
        // Scoped so the cleanup cannot fire once `parent` owns the node.
        errdefer link.deinit();
        try inlineAttachDestination(allocator, link, rest[dest_start..close]);
        try parent.addChild(link);
    }
    try funcs.parseInlinesRecursiveFn(allocator, funcs, link, rest[label_start..label_end]);
    return close + 1;
}

/// Delimited span (strong, emphasis, strikethrough) opened by `marker` at
/// rest[0]; the enclosed text is parsed recursively into a node of
/// `node_type`. With `require_content` an empty span is rejected.
/// Returns bytes consumed, or null.
fn inlineTryWrapped(
    allocator: Allocator,
    funcs: ParserFuncs,
    parent: *MdNode,
    pending: []const u8,
    rest: []const u8,
    marker: []const u8,
    node_type: MdNodeType,
    require_content: bool,
) !?usize {
    const end = findMatchingDelimiter(rest, marker) orelse return null;
    if (require_content and end <= marker.len) return null;

    try inlineEmitText(allocator, parent, pending);
    const node = try MdNode.create(allocator, node_type);
    {
        errdefer node.deinit();
        try parent.addChild(node);
    }
    try funcs.parseInlinesRecursiveFn(allocator, funcs, node, rest[marker.len..end]);
    return end + marker.len;
}

/// Code span at rest[0]: a run of N backticks closed by the next run of
/// exactly N backticks. Returns bytes consumed, or null.
fn inlineTryCodeSpan(allocator: Allocator, parent: *MdNode, pending: []const u8, rest: []const u8) !?usize {
    var open_len: usize = 1;
    while (open_len < rest.len and rest[open_len] == '`') open_len += 1;

    var close_at: ?usize = null;
    var scan: usize = open_len;
    while (scan < rest.len) {
        if (rest[scan] != '`') {
            scan += 1;
            continue;
        }
        var run_end = scan + 1;
        while (run_end < rest.len and rest[run_end] == '`') run_end += 1;
        if (run_end - scan == open_len) {
            close_at = scan;
            break;
        }
        scan = run_end; // Jump past a backtick run of the wrong length
    }
    const close = close_at orelse return null;

    try inlineEmitText(allocator, parent, pending);
    var body = rest[open_len..close];
    // Strip one space from each end when both ends have one and the content
    // is not made of spaces only.
    if (body.len >= 2 and body[0] == ' ' and body[body.len - 1] == ' ') {
        var all_spaces = true;
        for (body) |c| {
            if (c != ' ') {
                all_spaces = false;
                break;
            }
        }
        if (!all_spaces) body = body[1 .. body.len - 1];
    }
    try inlineEmitLeaf(allocator, parent, .InlineCode, body);
    return close + open_len;
}

/// '<' at rest[0]: either an autolink (`<http://…>`, `<https://…>`,
/// `<mailto:…>`, `<ftp://…>`) or a basic inline HTML tag.
/// Returns bytes consumed, or null.
fn inlineTryAngle(allocator: Allocator, parent: *MdNode, pending: []const u8, rest: []const u8) !?usize {
    const after = rest[1..];
    const is_autolink_uri = std.mem.startsWith(u8, after, "http://") or
        std.mem.startsWith(u8, after, "https://") or
        std.mem.startsWith(u8, after, "mailto:") or
        std.mem.startsWith(u8, after, "ftp://");

    if (is_autolink_uri) {
        // An unterminated autolink is not retried as an HTML tag.
        const rel = findNextUnescaped(after, '>') orelse return null;
        const end = 1 + rel;
        try inlineEmitText(allocator, parent, pending);
        const link = try MdNode.create(allocator, .Link);
        errdefer link.deinit();
        link.url = try dupeSlice(allocator, rest[1..end]);
        try inlineEmitText(allocator, link, rest[1..end]); // Autolink text is its URL
        try parent.addChild(link);
        return end + 1;
    }

    if (after.len > 0 and (std.ascii.isAlpha(after[0]) or after[0] == '/' or after[0] == '!')) { // Basic HTML tag
        const rel = std.mem.indexOfScalar(u8, after, '>') orelse return null;
        const end = 1 + rel;
        try inlineEmitText(allocator, parent, pending);
        try inlineEmitLeaf(allocator, parent, .HtmlInline, rest[0 .. end + 1]);
        return end + 1;
    }
    return null;
}

/// Parses the inline Markdown in `text_full` and appends the resulting nodes
/// to `parent_node`.
///
/// The text is scanned left to right. At each position the character decides
/// which construct is attempted: backslash escape, image, link, strong
/// (`**` / `__`), emphasis (`*` / `_`), strikethrough (`~~`), code span, or
/// autolink / inline HTML. When a construct is recognised, the plain text
/// accumulated before it is emitted as a Text node, the construct's node is
/// appended, and scanning resumes after it. Otherwise the character stays
/// part of the plain text. Nested content is parsed through
/// `funcs.parseInlinesRecursiveFn`.
fn parseInlinesRecursiveImpl(allocator: Allocator, funcs: ParserFuncs, parent_node: *MdNode, text_full: []const u8) !void {
    if (text_full.len == 0) return;

    var p_idx: usize = 0;
    var segment_start: usize = 0; // start of the plain text not yet emitted
    while (p_idx < text_full.len) {
        const pending = text_full[segment_start..p_idx];
        const rest = text_full[p_idx..];
        var consumed: ?usize = null;

        switch (rest[0]) {
            '\\' => consumed = try inlineTryEscape(allocator, parent_node, pending, rest),
            '!' => consumed = try inlineTryImage(allocator, parent_node, pending, rest),
            '[' => consumed = try inlineTryLink(allocator, funcs, parent_node, pending, rest),
            '*', '_' => {
                // A doubled marker is only ever tried as strong; a single
                // one only as emphasis.
                if (rest.len > 1 and rest[1] == rest[0]) {
                    consumed = try inlineTryWrapped(allocator, funcs, parent_node, pending, rest, rest[0..2], .Strong, false);
                } else {
                    consumed = try inlineTryWrapped(allocator, funcs, parent_node, pending, rest, rest[0..1], .Emphasis, true);
                }
            },
            '~' => {
                if (rest.len > 1 and rest[1] == '~') {
                    consumed = try inlineTryWrapped(allocator, funcs, parent_node, pending, rest, "~~", .Strikethrough, false);
                }
            },
            '`' => consumed = try inlineTryCodeSpan(allocator, parent_node, pending, rest),
            '<' => consumed = try inlineTryAngle(allocator, parent_node, pending, rest),
            else => {},
        }

        if (consumed) |byte_count| {
            p_idx += byte_count;
            segment_start = p_idx;
        } else {
            p_idx += 1;
        }
    }

    // Whatever plain text is left after the last construct.
    try inlineEmitText(allocator, parent_node, text_full[segment_start..p_idx]);
}
/// Entry point for inline parsing of a single block's text.
///
/// Empty `text` adds nothing to `parent_block_node`; any other input is
/// forwarded unchanged to the recursive inline parser registered in `funcs`.
/// Errors from that parser are propagated to the caller.
fn parseInlinesImpl(allocator: Allocator, funcs: ParserFuncs, parent_block_node: *MdNode, text: []const u8) !void {
    if (text.len != 0) {
        try funcs.parseInlinesRecursiveFn(allocator, funcs, parent_block_node, text);
    }
}
// --- Block Parsing ---
/// Given the index just past a list marker, returns the index at which the
/// item's content begins: one space is skipped, then further spaces while the
/// index stays below `after_marker + 4`, then at most one tab.
fn listContentStartIndex(line: []const u8, after_marker: usize) usize {
    var idx = after_marker;
    if (idx < line.len and line[idx] == ' ') idx += 1;
    while (idx < line.len and idx < after_marker + 4 and line[idx] == ' ') idx += 1;
    if (idx < line.len and line[idx] == '\t') idx += 1;
    return idx;
}

/// Recognises a list-item line.
///
/// A list item is: any number of leading spaces, then either a bullet
/// (`*`, `-`, `+`) or a run of digits followed by `.` or `)`, then either the
/// end of the line or a whitespace character.
///
/// Returns `null` when the line is not a list item. Otherwise the result holds
/// the count of leading spaces (`indent`), the slice of `line` where the
/// content starts (`content_start`), the parsed number for ordered items
/// (0 for bullets, and 0 when the digits do not fit in `usize`), and the
/// bullet/delimiter character in both `actual_marker_char` and `marker_type`.
fn isListItemLineImpl(line: []const u8) ?ListItemInfo {
    var indent: usize = 0;
    while (indent < line.len and line[indent] == ' ') indent += 1;
    if (indent == line.len) return null;

    const first = line[indent];
    var delimiter: u8 = first;
    var number: usize = 0;
    var after_marker: usize = indent + 1;

    switch (first) {
        '*', '-', '+' => {},
        '0'...'9' => {
            var digits_end = indent;
            while (digits_end < line.len and std.ascii.isDigit(line[digits_end])) digits_end += 1;
            if (digits_end == line.len) return null;
            delimiter = line[digits_end];
            if (delimiter != '.' and delimiter != ')') return null;
            number = std.fmt.parseInt(usize, line[indent..digits_end], 10) catch 0;
            after_marker = digits_end + 1;
        },
        else => return null,
    }

    // The marker must be followed by whitespace or by the end of the line.
    if (after_marker < line.len and !std.ascii.isSpace(line[after_marker])) return null;

    const content_idx = listContentStartIndex(line, after_marker);
    return ListItemInfo{
        .indent = indent,
        .content_start = line[content_idx..],
        .number_if_ordered = number,
        .actual_marker_char = delimiter,
        .marker_type = delimiter,
    };
}
/// Parses an ATX heading (`# Title`, up to six `#`).
///
/// `line_trimmed` is the line with leading whitespace already removed.
/// Returns `null` when the line is not a heading: no leading `#`, more than
/// six of them, or a non-whitespace character directly after the `#` run.
/// On success the returned heading node carries the level and the inline
/// children parsed from the heading text; the caller owns the node.
///
/// A trailing run of `#` and whitespace is stripped as the optional closing
/// sequence. A `#` is kept when it directly follows a character other than
/// space, tab or `#`, unless that character is a backslash belonging to an
/// even-length backslash run (i.e. the `#` itself is not escaped).
fn parseHeadingAtx(allocator: Allocator, funcs: ParserFuncs, line_trimmed: []const u8) !?*MdNode {
    var level: u8 = 0;
    var p_idx: usize = 0;
    while (p_idx < line_trimmed.len and line_trimmed[p_idx] == '#') : (p_idx += 1) {
        // Stop at the 7th '#': the line cannot be a heading, and bailing out
        // here keeps `level` from overflowing on very long runs of '#'.
        if (level == 6) return null;
        level += 1;
    }
    if (level == 0) return null;
    if (p_idx < line_trimmed.len and !std.ascii.isSpace(line_trimmed[p_idx])) return null;
    while (p_idx < line_trimmed.len and std.ascii.isSpace(line_trimmed[p_idx])) p_idx += 1;

    var content = line_trimmed[p_idx..];
    var end_idx = content.len;
    while (end_idx > 0) {
        const last = content[end_idx - 1];
        if (last == '#') {
            var can_remove_hash = true;
            if (end_idx > 1) {
                const before_hash = content[end_idx - 2];
                if (before_hash == '\\') {
                    // Count the backslashes immediately preceding the '#';
                    // an odd count means the '#' is escaped and must stay.
                    var backslashes: usize = 0;
                    var k = end_idx - 1;
                    while (k > 0 and content[k - 1] == '\\') : (k -= 1) backslashes += 1;
                    if (backslashes % 2 == 1) can_remove_hash = false;
                } else if (before_hash != ' ' and before_hash != '\t' and before_hash != '#') {
                    can_remove_hash = false;
                }
            }
            if (!can_remove_hash) break;
            end_idx -= 1;
        } else if (std.ascii.isSpace(last)) {
            end_idx -= 1;
        } else break;
    }
    content = trimTrailingWhitespace(content[0..end_idx]);

    const heading_node = try MdNode.create(allocator, .Heading);
    // Do not leak the node if inline parsing fails.
    errdefer heading_node.deinit();
    heading_node.heading_level = level;
    try funcs.parseInlinesFn(allocator, funcs, heading_node, content);
    return heading_node;
}
/// Parses a fenced code block starting at `lines[current_line_index.*]`.
///
/// The opening line must consist of optional leading spaces, a run of at
/// least three identical fence characters (backtick or `~`), and an optional
/// info string; a backtick fence whose info string contains a backtick is
/// rejected. Returns `null` (without advancing `current_line_index`) when the
/// line is not a valid opening fence.
///
/// On success `current_line_index` points past the closing fence, or at
/// `lines.len` when the block is unterminated. The first word of the info
/// string is stored in `code_language`; the content lines, each with up to
/// the opening fence's indentation removed, are joined with '\n' into
/// `code_content`. If the joined text ends in '\n' (the last content line is
/// empty) that final newline is dropped. The caller owns the returned node.
fn parseFencedCodeBlock(allocator: Allocator, funcs: ParserFuncs, lines: []const []const u8, current_line_index: *usize) !?*MdNode {
    _ = funcs; // Not used here

    const first_line = lines[current_line_index.*];
    const line_indent = countLeadingSpaces(first_line);
    const opener = first_line[line_indent..];
    if (opener.len < 3) return null;
    const fence_char = opener[0];
    if (fence_char != '`' and fence_char != '~') return null;

    var fence_len: usize = 0;
    while (fence_len < opener.len and opener[fence_len] == fence_char) fence_len += 1;
    if (fence_len < 3) return null;

    const info = trimLeadingWhitespace(opener[fence_len..]);
    if (fence_char == '`' and std.mem.indexOfScalar(u8, info, '`') != null) {
        return null;
    }

    const code_node = try MdNode.create(allocator, .FencedCodeBlock);
    // Release the node if any later allocation fails.
    errdefer code_node.deinit();

    if (info.len > 0) {
        const info_trimmed = trimTrailingWhitespace(info);
        const first_word_end = std.mem.indexOfScalar(u8, info_trimmed, ' ') orelse info_trimmed.len;
        code_node.code_language = try dupeSlice(allocator, info_trimmed[0..first_word_end]);
    }
    current_line_index.* += 1;

    var content_buffer = LineBuffer.init(allocator);
    defer content_buffer.deinit();

    while (current_line_index.* < lines.len) : (current_line_index.* += 1) {
        const current = lines[current_line_index.*];
        const leading = countLeadingSpaces(current);

        // CommonMark: a closing fence may be indented 0-3 spaces regardless of
        // the opener. Indentation up to the opener's own is also accepted so
        // that previously recognised closers keep working.
        if (leading <= @max(line_indent, 3)) {
            const rest = current[leading..];
            var closing_len: usize = 0;
            while (closing_len < rest.len and rest[closing_len] == fence_char) closing_len += 1;
            if (closing_len >= fence_len and isBlankLine(trimLeadingWhitespace(rest[closing_len..]))) {
                current_line_index.* += 1; // consume the closing fence
                break;
            }
        }

        try content_buffer.addLine(current[@min(leading, line_indent)..]);
    }

    var joined_content = ArrayList(u8).init(allocator);
    defer joined_content.deinit();
    for (content_buffer.getLines(), 0..) |line, i| {
        if (i != 0) try joined_content.append('\n');
        try joined_content.appendSlice(line);
    }

    // Strip the final newline of the content, if one exists.
    const joined = joined_content.items;
    if (joined.len > 0 and joined[joined.len - 1] == '\n') {
        code_node.code_content = try dupeSlice(allocator, joined[0 .. joined.len - 1]);
    } else {
        // Also covers the empty block: the result is an empty, non-null slice.
        code_node.code_content = try joined_content.toOwnedSlice();
    }
    return code_node;
}
/// Parses an indented code block starting at `lines[current_line_index.*]`.
///
/// Consumes consecutive lines that are either indented by at least
/// `TAB_STOP_WIDTH` spaces or blank (blank lines only once at least one code
/// line has been collected). `TAB_STOP_WIDTH` columns of indentation are
/// removed from each code line, trailing blank lines are dropped from the
/// content (they stay consumed), and the remaining lines are joined with '\n'.
///
/// Returns `null` without advancing `current_line_index` when the first line
/// is not an indented code line, so a caller that loops on the result can
/// never spin without making progress. The caller owns the returned node.
fn parseCodeBlockIndented(allocator: Allocator, funcs: ParserFuncs, lines: []const []const u8, current_line_index: *usize) !?*MdNode {
    _ = funcs;

    var content_buffer = LineBuffer.init(allocator);
    defer content_buffer.deinit();

    while (current_line_index.* < lines.len) : (current_line_index.* += 1) {
        const line = lines[current_line_index.*];
        const indent = countLeadingSpaces(line);
        if (isBlankLine(line)) {
            // A blank line can continue a block but never start one.
            if (content_buffer.getLines().len == 0) break;
            try content_buffer.addLine(line[@min(indent, TAB_STOP_WIDTH)..]);
        } else {
            if (indent < TAB_STOP_WIDTH) break;
            try content_buffer.addLine(line[TAB_STOP_WIDTH..]);
        }
    }

    // Trailing blank lines are not part of the code content.
    while (content_buffer.getLines().len > 0 and isBlankLine(content_buffer.getLines()[content_buffer.getLines().len - 1])) {
        const last_line = content_buffer.lines.pop();
        allocator.free(last_line);
    }

    // Blank lines are only stored after a code line, so an empty buffer means
    // nothing was consumed at all.
    if (content_buffer.getLines().len == 0) return null;

    const code_node = try MdNode.create(allocator, .CodeBlock);
    // Release the node if joining the content fails.
    errdefer code_node.deinit();

    var joined_code = ArrayList(u8).init(allocator);
    defer joined_code.deinit();
    for (content_buffer.getLines(), 0..) |ln, i| {
        if (i != 0) try joined_code.append('\n');
        try joined_code.appendSlice(ln);
    }
    code_node.code_content = try joined_code.toOwnedSlice();
    return code_node;
}
/// Parses a blockquote starting at `lines[current_line_index.*]`.
///
/// Lines whose first non-whitespace character is `>` contribute the text
/// after the marker (one following space or tab is dropped). Once at least
/// one such line has been seen, any non-blank line without a marker is
/// accepted as a lazy continuation. The collected lines are then parsed as
/// blocks into the blockquote node.
///
/// Returns `null` when no line was collected; otherwise the caller owns the
/// returned node and `current_line_index` points past the consumed lines.
fn parseBlockquote(allocator: Allocator, funcs: ParserFuncs, lines: []const []const u8, current_line_index: *usize) !?*MdNode {
    var bq_content_lines = LineBuffer.init(allocator);
    // Released on every exit path, and only after its last use.
    defer bq_content_lines.deinit();

    var seen_marker_line = false;
    while (current_line_index.* < lines.len) : (current_line_index.* += 1) {
        const line = lines[current_line_index.*];
        const trimmed_line = trimLeadingWhitespace(line);
        if (trimmed_line.len > 0 and trimmed_line[0] == '>') {
            var content_after_marker = trimmed_line[1..];
            if (content_after_marker.len > 0 and (content_after_marker[0] == ' ' or content_after_marker[0] == '\t')) {
                content_after_marker = content_after_marker[1..];
            }
            try bq_content_lines.addLine(content_after_marker);
            seen_marker_line = true;
        } else if (seen_marker_line and !isBlankLine(line)) {
            // Lazy continuation line.
            try bq_content_lines.addLine(line);
        } else {
            break;
        }
    }

    if (bq_content_lines.getLines().len == 0) return null;

    const bq_node = try MdNode.create(allocator, .Blockquote);
    // Do not leak the node (and any children added so far) on failure.
    errdefer bq_node.deinit();

    var nested_line_idx: usize = 0;
    _ = try funcs.parseBlocksFn(allocator, funcs, bq_content_lines.getLines(), &nested_line_idx, bq_node);
    return bq_node;
}
/// Parses a list starting at `lines[current_line_index.*]`.
///
/// The first line decides whether the list is ordered (`.`/`)` delimiter) or
/// unordered and provides the list's marker and start number. Items are then
/// collected while the next item line has the same kind and an indent of at
/// least `initial_list_indent`. Blank lines between two sibling items are
/// skipped and make the list loose; blank lines that are not followed by
/// another item of this list are left for the caller.
///
/// Each item's content is its first line after the marker plus the following
/// lines indented at least to the content column; a blank line is kept when
/// the next line is such a continuation line and not a list item. The content
/// is parsed as blocks into the item node.
///
/// The list is marked loose when items are separated by blank lines, when an
/// item contains an internal blank line, or when an item has more than one
/// child block.
///
/// NOTE: a line that is itself a list item of the same kind (and, for ordered
/// lists, the same delimiter) always starts a new sibling item, even when it
/// is indented to the current item's content column.
///
/// Returns `null` when the first line is not a list item or no item could be
/// collected. The caller owns the returned node.
fn parseList(allocator: Allocator, funcs: ParserFuncs, lines: []const []const u8, current_line_index: *usize, initial_list_indent: usize) !?*MdNode {
    const first_item_info = funcs.isListItemLineFn(lines[current_line_index.*]) orelse return null;
    const is_ordered = first_item_info.marker_type == '.' or first_item_info.marker_type == ')';
    // Explicit type: an enum literal chosen by a runtime condition needs one.
    const list_type: MdNodeType = if (is_ordered) .OrderedList else .UnorderedList;

    const list_node = try MdNode.create(allocator, list_type);
    // Release the list (and the items already attached) on failure.
    errdefer list_node.deinit();
    if (is_ordered) {
        list_node.list_start_number = first_item_info.number_if_ordered;
    }
    list_node.list_item_marker = first_item_info.actual_marker_char;
    list_node.tight_list = true;

    while (true) {
        // Look past blank lines: they only belong to the list when another
        // item of this list follows them.
        var item_line_idx = current_line_index.*;
        while (item_line_idx < lines.len and isBlankLine(lines[item_line_idx])) item_line_idx += 1;
        if (item_line_idx >= lines.len) break;
        const blank_before_item = item_line_idx != current_line_index.*;

        const item_line = lines[item_line_idx];
        const item_info = funcs.isListItemLineFn(item_line) orelse break;
        if (item_info.indent < initial_list_indent) break;
        const item_is_ordered = item_info.marker_type == '.' or item_info.marker_type == ')';
        if (item_is_ordered != is_ordered) break;

        // Items separated by blank lines make the list loose.
        if (blank_before_item) list_node.tight_list = false;
        current_line_index.* = item_line_idx + 1;

        const item_node = try MdNode.create(allocator, .ListItem);
        // Only reachable while the item is not yet attached to the list.
        errdefer item_node.deinit();
        item_node.list_item_marker = item_info.actual_marker_char;

        var item_lines = LineBuffer.init(allocator);
        defer item_lines.deinit();
        try item_lines.addLine(item_info.content_start);

        // `content_start` is a suffix of the item line, so the length
        // difference is the width of indent + marker + padding.
        const content_column = item_line.len - item_info.content_start.len;
        var item_has_internal_blank = false;

        while (current_line_index.* < lines.len) : (current_line_index.* += 1) {
            const next_line = lines[current_line_index.*];

            if (funcs.isListItemLineFn(next_line)) |next_info| {
                const next_is_ordered = next_info.marker_type == '.' or next_info.marker_type == ')';
                const delimiter_matches = !is_ordered or
                    next_info.actual_marker_char == list_node.list_item_marker or
                    list_node.children.items.len == 0;
                // A sibling item ends the current one. An ordered item with a
                // different delimiter falls through and may become content.
                if (next_info.indent >= initial_list_indent and next_is_ordered == is_ordered and delimiter_matches) break;
            }

            if (isBlankLine(next_line)) {
                if (current_line_index.* + 1 >= lines.len) break;
                const after_blank = lines[current_line_index.* + 1];
                const continues_item = countLeadingSpaces(after_blank) >= content_column and
                    funcs.isListItemLineFn(after_blank) == null;
                if (!continues_item) break;
                try item_lines.addLine("");
                item_has_internal_blank = true; // internal blank implies loose
            } else {
                if (countLeadingSpaces(next_line) < content_column) break;
                try item_lines.addLine(next_line[content_column..]);
            }
        }

        var nested_line_idx: usize = 0;
        _ = try funcs.parseBlocksFn(allocator, funcs, item_lines.getLines(), &nested_line_idx, item_node);

        if (item_has_internal_blank or item_node.children.items.len > 1) {
            list_node.tight_list = false;
        }
        try list_node.addChild(item_node);
    }

    if (list_node.children.items.len == 0) {
        list_node.deinit();
        return null;
    }
    return list_node;
}
/// Checks whether `line_trimmed` is a horizontal rule.
///
/// The line must start with `*`, `-` or `_`, contain at least three of that
/// character, and otherwise consist only of whitespace.
///
/// Returns `null` when the line is not a rule (or when the probe node cannot
/// be allocated). Otherwise returns a freshly created `.HorizontalRule` node
/// that the caller owns and must release with `deinit()`; callers that only
/// need the yes/no answer should release it immediately.
fn parseHorizontalRuleImpl(line_trimmed: []const u8) ?*MdNode {
    if (line_trimmed.len == 0) return null;

    const rule_char = line_trimmed[0];
    switch (rule_char) {
        '*', '-', '_' => {},
        else => return null,
    }

    var count: usize = 0;
    for (line_trimmed) |c| {
        if (c == rule_char) {
            count += 1;
        } else if (!std.ascii.isSpace(c)) {
            return null;
        }
    }
    if (count < 3) return null;

    // The signature carries no allocator. `std.testing.allocator` is rejected
    // at compile time outside test builds, so the short-lived probe node is
    // taken from the page allocator, which is available in every build mode.
    return MdNode.create(std.heap.page_allocator, .HorizontalRule) catch null;
}
/// Returns true when `line_trimmed` is a horizontal rule. The probe node
/// produced by the rule parser is released immediately.
fn probeHorizontalRule(funcs: ParserFuncs, line_trimmed: []const u8) bool {
    const probe = funcs.parseHorizontalRuleFn(line_trimmed) orelse return false;
    probe.deinit();
    return true;
}

/// Returns the setext heading level announced by `underline` (non-empty,
/// leading whitespace removed), or 0 when it is not a setext underline.
/// The line must consist solely of `=` (level 1) or `-` (level 2); a dash
/// line that also qualifies as a horizontal rule is not treated as an
/// underline.
fn setextUnderlineLevel(funcs: ParserFuncs, underline: []const u8) u8 {
    const marker = underline[0];
    if (marker != '=' and marker != '-') return 0;
    for (underline) |c| {
        if (c != marker) return 0;
    }
    if (marker == '=') return 1;
    return if (probeHorizontalRule(funcs, underline)) 0 else 2;
}

/// Frees every line stored in `buffer` and empties it, keeping its capacity.
fn clearParagraphBuffer(allocator: Allocator, buffer: *LineBuffer) void {
    for (buffer.lines.items) |l| allocator.free(l);
    buffer.lines.clearRetainingCapacity();
}

/// Turns the buffered lines into a paragraph node appended to
/// `parent_container`, then empties the buffer. Does nothing for an empty
/// buffer. A line ending in two spaces gets a hard break after it; other
/// lines except the last get a soft break. A paragraph that ends up without
/// children is discarded.
fn flushParagraphBuffer(allocator: Allocator, funcs: ParserFuncs, paragraph_buffer: *LineBuffer, parent_container: *MdNode) !void {
    const para_lines = paragraph_buffer.getLines();
    if (para_lines.len == 0) return;

    const para_node = try MdNode.create(allocator, .Paragraph);
    {
        // Active only until the paragraph is attached or discarded below.
        errdefer para_node.deinit();

        for (para_lines, 0..) |para_line, i| {
            var text = para_line;
            const hard_break = text.len >= 2 and text[text.len - 1] == ' ' and text[text.len - 2] == ' ';
            if (hard_break) text = text[0 .. text.len - 2];
            try funcs.parseInlinesFn(allocator, funcs, para_node, text);

            const break_kind: ?MdNodeType = if (hard_break)
                .HardBreak
            else if (i + 1 < para_lines.len)
                .SoftBreak
            else
                null;
            if (break_kind) |kind| {
                const break_node = try MdNode.create(allocator, kind);
                errdefer break_node.deinit();
                try para_node.addChild(break_node);
            }
        }

        if (para_node.children.items.len > 0) {
            try parent_container.addChild(para_node);
        } else {
            para_node.deinit();
        }
    }
    clearParagraphBuffer(allocator, paragraph_buffer);
}

/// Parses `lines` from `current_line_index.*` to the end into block nodes
/// appended to `parent_container`, and returns `parent_container`.
///
/// Consecutive plain lines are buffered as a paragraph. The paragraph is
/// closed by a blank line, by the end of input, by a setext underline (which
/// turns the buffered lines into a heading), or by a line that starts another
/// block: ATX heading, fenced code block, horizontal rule, blockquote or list
/// item. A line indented by at least `TAB_STOP_WIDTH` starts an indented code
/// block only when no paragraph is open. A line whose block parser declines
/// it is treated as paragraph text.
fn parseBlocksImpl(allocator: Allocator, funcs: ParserFuncs, lines: []const []const u8, current_line_index: *usize, parent_container: *MdNode) !*MdNode {
    var paragraph_buffer = LineBuffer.init(allocator);
    defer paragraph_buffer.deinit();

    while (current_line_index.* < lines.len) {
        const line_orig = lines[current_line_index.*];
        const line_trimmed_leading = trimLeadingWhitespace(line_orig);
        // The trimmed slice is a suffix of the line, so the length difference
        // is the amount of leading whitespace.
        const initial_indent_spaces = line_orig.len - line_trimmed_leading.len;
        const line_is_blank = isBlankLine(line_orig);

        // Setext heading: an underline directly below buffered paragraph text.
        if (paragraph_buffer.getLines().len > 0 and line_trimmed_leading.len > 0) {
            const setext_level = setextUnderlineLevel(funcs, line_trimmed_leading);
            if (setext_level > 0) {
                const heading_node = try MdNode.create(allocator, .Heading);
                errdefer heading_node.deinit();
                heading_node.heading_level = setext_level;

                var full_text_list = ArrayList(u8).init(allocator);
                defer full_text_list.deinit();
                for (paragraph_buffer.getLines(), 0..) |para_line, i| {
                    if (i != 0) try full_text_list.append('\n'); // lines are joined with a newline
                    try full_text_list.appendSlice(para_line);
                }
                try funcs.parseInlinesFn(allocator, funcs, heading_node, full_text_list.items);
                try parent_container.addChild(heading_node);

                clearParagraphBuffer(allocator, &paragraph_buffer);
                current_line_index.* += 1;
                continue;
            }
        }

        // Does this line start a block that interrupts an open paragraph?
        var line_starts_new_non_para_block = false;
        if (line_trimmed_leading.len > 0) {
            const first_char = line_trimmed_leading[0];
            line_starts_new_non_para_block = first_char == '#' or
                std.mem.startsWith(u8, line_trimmed_leading, "```") or
                std.mem.startsWith(u8, line_trimmed_leading, "~~~") or
                probeHorizontalRule(funcs, line_trimmed_leading) or
                first_char == '>' or
                funcs.isListItemLineFn(line_orig) != null;
        }
        if (paragraph_buffer.getLines().len == 0 and initial_indent_spaces >= TAB_STOP_WIDTH and !line_is_blank) {
            line_starts_new_non_para_block = true;
        }

        if (paragraph_buffer.getLines().len > 0 and (line_starts_new_non_para_block or line_is_blank)) {
            try flushParagraphBuffer(allocator, funcs, &paragraph_buffer, parent_container);
        }

        if (line_is_blank) {
            current_line_index.* += 1;
            continue;
        }

        var new_block_node: ?*MdNode = null;
        if (line_trimmed_leading.len > 0) {
            const first_char = line_trimmed_leading[0];
            if (first_char == '#') {
                new_block_node = try parseHeadingAtx(allocator, funcs, line_trimmed_leading);
                if (new_block_node != null) current_line_index.* += 1;
            } else if (std.mem.startsWith(u8, line_trimmed_leading, "```") or std.mem.startsWith(u8, line_trimmed_leading, "~~~")) {
                new_block_node = try parseFencedCodeBlock(allocator, funcs, lines, current_line_index);
            } else if (probeHorizontalRule(funcs, line_trimmed_leading)) {
                // The probe node has been released; the real node comes from
                // the caller's allocator.
                new_block_node = try MdNode.create(allocator, .HorizontalRule);
                current_line_index.* += 1;
            }
            if (new_block_node == null and first_char == '>') {
                new_block_node = try parseBlockquote(allocator, funcs, lines, current_line_index);
            }
        }
        if (new_block_node == null and funcs.isListItemLineFn(line_orig) != null) {
            new_block_node = try parseList(allocator, funcs, lines, current_line_index, initial_indent_spaces);
        }
        if (new_block_node == null and initial_indent_spaces >= TAB_STOP_WIDTH and paragraph_buffer.getLines().len == 0) {
            new_block_node = try parseCodeBlockIndented(allocator, funcs, lines, current_line_index);
        }

        if (new_block_node) |nbn| {
            errdefer nbn.deinit();
            try parent_container.addChild(nbn);
            continue;
        }

        // Trailing whitespace is kept: two trailing spaces mark a hard break.
        try paragraph_buffer.addLine(line_trimmed_leading);
        current_line_index.* += 1;
    }

    // Paragraph still open at the end of input.
    try flushParagraphBuffer(allocator, funcs, &paragraph_buffer, parent_container);
    return parent_container;
}
/// Creates a `.Document` node and parses all of `lines_input` into it using
/// the block parser registered in `funcs`.
///
/// Returns the document node, which the caller owns. On failure the partially
/// built document is released before the error is propagated.
fn parseDocumentFromLines(allocator: Allocator, funcs: ParserFuncs, lines_input: []const []const u8) !*MdNode {
    const doc = try MdNode.create(allocator, .Document);
    errdefer doc.deinit();

    // The block parser advances the index through a pointer, so it must be a
    // mutable variable passed by address.
    var current_line_idx: usize = 0;
    _ = try funcs.parseBlocksFn(allocator, funcs, lines_input, &current_line_idx, doc);
    return doc;
}
/// Parses `markdown_text` into a document tree.
///
/// The text is split on '\n'; one trailing '\r' per line is removed so CRLF
/// input works. Each line is passed through `expandTabs`; if that fails a
/// warning is printed and the unexpanded line is used instead. The collected
/// lines are then handed to `parseDocumentFromLines`.
///
/// The caller owns the returned document node.
pub fn parseMarkdown(allocator: Allocator, markdown_text: []const u8) !*MdNode {
    const funcs = ParserFuncs{
        .parseInlinesRecursiveFn = parseInlinesRecursiveImpl,
        .parseInlinesFn = parseInlinesImpl,
        .parseBlocksFn = parseBlocksImpl,
        .parseHorizontalRuleFn = parseHorizontalRuleImpl,
        .isListItemLineFn = isListItemLineImpl,
    };

    var line_buf_storage = LineBuffer.init(allocator);
    defer line_buf_storage.deinit();

    var splitter = std.mem.splitScalar(u8, markdown_text, '\n');
    while (splitter.next()) |raw_line| {
        // Handle CRLF line endings.
        const ends_with_cr = raw_line.len > 0 and raw_line[raw_line.len - 1] == '\r';
        const line = if (ends_with_cr) raw_line[0 .. raw_line.len - 1] else raw_line;

        if (expandTabs(allocator, line)) |expanded| {
            // The expanded copy is released at the end of this iteration.
            defer allocator.free(expanded);
            try line_buf_storage.addLine(expanded);
        } else |_| {
            std.debug.print("Warning: expandTabs failed for line: {s}\n", .{line});
            try line_buf_storage.addLine(line);
        }
    }

    return parseDocumentFromLines(allocator, funcs, line_buf_storage.getLines());
}
// --- AST Printer for Demo/Debug ---
/// Prints one indentation unit per level.
fn printAstIndent(levels: usize) void {
    for (0..levels) |_| print(" ", .{});
}

/// Prints the content line shared by both code block kinds: the text itself
/// when shorter than 60 bytes, a "(long)" placeholder otherwise, and
/// "(null)" (without indentation) when there is no content.
fn printAstCodeContent(code_content: ?[]const u8, indent_level: usize) void {
    const cc = code_content orelse {
        print("Content: (null)\n", .{});
        return;
    };
    printAstIndent(indent_level + 1);
    if (cc.len < 60) {
        print("Content: [[{s}]]\n", .{cc});
    } else {
        print("Content: (long)\n", .{});
    }
}

/// Prints `node` and, recursively, all of its children, one node per line.
/// Each line is indented by `indent_level` units; children are printed one
/// level deeper than their parent.
fn printAstNode(node: *MdNode, indent_level: usize) void {
    printAstIndent(indent_level);

    const tight_label = if (node.tight_list) "yes" else "no";
    switch (node.node_type) {
        .Document => print("DOCUMENT\n", .{}),
        .Paragraph => print("PARAGRAPH\n", .{}),
        .Heading => print("HEADING (Level {d})\n", .{node.heading_level}),
        .Blockquote => print("BLOCKQUOTE\n", .{}),
        .UnorderedList => print("UNORDERED_LIST (Marker: '{c}', Tight: {s})\n", .{ node.list_item_marker, tight_label }),
        .OrderedList => print("ORDERED_LIST (Start: {d}, Marker: '{c}', Tight: {s})\n", .{ node.list_start_number, node.list_item_marker, tight_label }),
        .ListItem => print("LIST_ITEM (Marker: '{c}')\n", .{node.list_item_marker}),
        .CodeBlock => {
            print("CODE_BLOCK (Indented)\n", .{});
            printAstCodeContent(node.code_content, indent_level);
        },
        .FencedCodeBlock => {
            print("FENCED_CODE_BLOCK (Lang: {?s})\n", .{node.code_language});
            printAstCodeContent(node.code_content, indent_level);
        },
        .HorizontalRule => print("HORIZONTAL_RULE\n", .{}),
        .HtmlBlock => print("HTML_BLOCK: [[{?s}]]\n", .{node.text_content}),
        .Text => print("TEXT: \"{?s}\"\n", .{node.text_content}),
        .Emphasis => print("EMPHASIS\n", .{}),
        .Strong => print("STRONG\n", .{}),
        .Strikethrough => print("STRIKETHROUGH\n", .{}),
        .InlineCode => print("INLINE_CODE: `{?s}`\n", .{node.text_content}),
        .Link => print("LINK (URL: {?s}, Title: {?s})\n", .{ node.url, node.title }),
        .Image => print("IMAGE (Alt: {?s}, URL: {?s}, Title: {?s})\n", .{ node.alt_text, node.url, node.title }),
        .SoftBreak => print("SOFT_BREAK\n", .{}),
        .HardBreak => print("HARD_BREAK\n", .{}),
        .HtmlInline => print("HTML_INLINE: {?s}\n", .{node.text_content}),
        .EscapedChar => print("ESCAPED_CHAR: \\{?s}\n", .{node.text_content}),
    }

    for (node.children.items) |child_node| {
        printAstNode(child_node, indent_level + 1);
    }
}
/// Demo entry point: parses a built-in Markdown sample and prints both the
/// source text and the resulting AST.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    // Sample exercising the block and inline constructs the parser handles.
    const markdown_example =
        \\# Welcome to Markdown
        \\
        \\This is a paragraph with *italic* and **bold** text.
        \\And a line with two spaces at the end for a hard break.
        \\Next line.
        \\
        \\Another paragraph with `inline code` and a [link](http://example.com "Optional Title").
        \\An image: ![Alt text](/path/to/image.jpg "Image Title")
        \\
        \\> This is a blockquote.
        \\> With multiple lines.
        \\
        \\And a lazy continuation
        \\for the blockquote.
        \\
        \\## Sub Heading
        \\
        \\Setext L1
        \\=========
        \\
        \\Setext L2
        \\---------
        \\
        \\Indented code block:
        \\
        \\ int main() {
        \\ printf("Hello");
        \\ }
        \\
        \\Fenced code block:
        \\```c
        \\void func() {
        \\ // comment
        \\}
        \\```
        \\
        \\Unordered List (tight):
        \\- Item 1
        \\ - Nested Item 1.1
        \\ - Nested Item 1.2
        \\- Item 2
        \\ * With more indent
        \\
        \\Ordered List (tight):
        \\1. First
        \\2. Second
        \\ 1) Nested first (marker change)
        \\ 2) Nested second
        \\3. Third
        \\
        \\* Loose list item 1
        \\
        \\* Loose list item 2
        \\ This is content for item 2.
        \\
        \\ Still item 2, after blank line.
        \\
        \\* Loose list item 3
        \\
        \\Horizontal Rule:
        \\---
        \\
        \\Escaped chars: \*hello\* \`code\` \[link\]
        \\Inline HTML: <custom-tag attr="val">content</custom-tag> also <br/> this is text.
        \\Autolink: <http://google.com>
        \\Not an autolink: <[email protected]> (mailto: is required)
        \\Autolink mail: <mailto:[email protected]>
    ;

    print("Parsing Markdown:\n{s}\n", .{markdown_example});

    const doc = try parseMarkdown(allocator, markdown_example);
    defer doc.deinit();

    print("\nAST Structure:\n", .{});
    printAstNode(doc, 0);
}
// Parses a heading followed by a paragraph with emphasis and checks the
// shape and text of the resulting tree.
test "basic markdown parsing" {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    const doc = try parseMarkdown(allocator, "# Hello\n\nThis is *fun*.");
    defer doc.deinit();

    try std.testing.expectEqual(doc.node_type, MdNodeType.Document);
    try std.testing.expectEqual(doc.children.items.len, 2);

    // First block: level-1 heading containing the text "Hello".
    const heading = doc.children.items[0];
    try std.testing.expectEqual(heading.node_type, MdNodeType.Heading);
    try std.testing.expectEqual(heading.heading_level, 1);
    try std.testing.expectEqual(heading.children.items.len, 1);
    const heading_text = heading.children.items[0];
    try std.testing.expectEqual(heading_text.node_type, MdNodeType.Text);
    try std.testing.expectEqualStrings("Hello", heading_text.text_content.?);

    // Second block: paragraph made of text, emphasis, text.
    const para = doc.children.items[1];
    try std.testing.expectEqual(para.node_type, MdNodeType.Paragraph);
    try std.testing.expectEqual(para.children.items.len, 3);
    const leading_text = para.children.items[0];
    const emphasis = para.children.items[1];
    const trailing_text = para.children.items[2];

    try std.testing.expectEqual(leading_text.node_type, MdNodeType.Text);
    try std.testing.expectEqualStrings("This is ", leading_text.text_content.?);

    try std.testing.expectEqual(emphasis.node_type, MdNodeType.Emphasis);
    try std.testing.expectEqual(emphasis.children.items.len, 1);
    const emphasis_text = emphasis.children.items[0];
    try std.testing.expectEqual(emphasis_text.node_type, MdNodeType.Text);
    try std.testing.expectEqualStrings("fun", emphasis_text.text_content.?);

    try std.testing.expectEqual(trailing_text.node_type, MdNodeType.Text);
    try std.testing.expectEqualStrings(".", trailing_text.text_content.?);
}
// Checks the language and content extracted from fenced code blocks:
// a block with an info string, an empty block, and an unterminated block.
test "fenced code block lang and content" {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    const md = "```zig\nconst x = 10;\n```";
    const doc = try parseMarkdown(allocator, md);
    defer doc.deinit();
    try std.testing.expectEqual(doc.children.items.len, 1);
    const fcb = doc.children.items[0];
    try std.testing.expectEqual(fcb.node_type, .FencedCodeBlock);
    try std.testing.expectEqualStrings("zig", fcb.code_language.?);
    try std.testing.expectEqualStrings("const x = 10;", fcb.code_content.?);

    const md_empty = "```\n\n```";
    const doc_empty = try parseMarkdown(allocator, md_empty);
    defer doc_empty.deinit();
    const fcb_empty = doc_empty.children.items[0];
    try std.testing.expectEqual(fcb_empty.node_type, .FencedCodeBlock);
    try std.testing.expect(fcb_empty.code_language == null);
    try std.testing.expectEqualStrings("", fcb_empty.code_content.?);

    // A fence run at the end of a content line does not close the block: a
    // closing fence must start its own line. The block therefore runs to the
    // end of the input and the backticks are part of the content.
    const md_no_final_newline = "```\ntest```";
    const doc_no_finalnl = try parseMarkdown(allocator, md_no_final_newline);
    defer doc_no_finalnl.deinit();
    const fcb_no_finalnl = doc_no_finalnl.children.items[0];
    try std.testing.expectEqualStrings("test```", fcb_no_finalnl.code_content.?);
}
// Checks the tight/loose classification of lists.
test "list parsing tight vs loose" {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    // No blank lines anywhere: tight.
    const md_tight = "- item1\n- item2";
    const doc_tight = try parseMarkdown(allocator, md_tight);
    defer doc_tight.deinit();
    try std.testing.expectEqual(doc_tight.children.items.len, 1);
    const list_tight = doc_tight.children.items[0];
    try std.testing.expectEqual(list_tight.node_type, .UnorderedList);
    try std.testing.expect(list_tight.tight_list);

    // Blank line between two items: one loose list.
    const md_loose_blank_between = "- item1\n\n- item2";
    const doc_loose_bb = try parseMarkdown(allocator, md_loose_blank_between);
    defer doc_loose_bb.deinit();
    try std.testing.expectEqual(doc_loose_bb.children.items.len, 1);
    const list_loose_bb = doc_loose_bb.children.items[0];
    try std.testing.expectEqual(list_loose_bb.node_type, .UnorderedList);
    try std.testing.expect(!list_loose_bb.tight_list);

    // Blank line inside an item's content: loose, and the item has two blocks.
    const md_loose_internal_blank = "- item1\n para1\n\n para2\n- item2";
    const doc_loose_ib = try parseMarkdown(allocator, md_loose_internal_blank);
    defer doc_loose_ib.deinit();
    const list_loose_ib = doc_loose_ib.children.items[0];
    try std.testing.expectEqual(list_loose_ib.node_type, .UnorderedList);
    try std.testing.expect(!list_loose_ib.tight_list);
    try std.testing.expectEqual(list_loose_ib.children.items[0].children.items.len, 2);

    // Blank lines that merely follow the last item are not part of the item.
    // CommonMark calls a list loose only when items are separated by blank
    // lines or an item directly contains two blocks with a blank line between
    // them; neither applies here, so the list stays tight.
    const md_trailing_blank = "* item1\n\n";
    const doc_trailing_blank = try parseMarkdown(allocator, md_trailing_blank);
    defer doc_trailing_blank.deinit();
    const list_trailing_blank = doc_trailing_blank.children.items[0];
    try std.testing.expectEqual(list_trailing_blank.node_type, .UnorderedList);
    try std.testing.expect(list_trailing_blank.tight_list);
}
// This is an implementation from scratch
const std = @import("std");
const Allocator = std.mem.Allocator;
const ArrayList = std.ArrayList;
const AutoHashMap = std.AutoHashMap;
const StringHashMap = std.StringHashMap;
const util = std.util;
const ascii = std.ascii;
const mem = std.mem;
const DEBUG = false; // Set to true for debug prints
/// Debug logging helper: forwards `fmt`/`args` to `std.debug.print`, but only
/// when the file-level `DEBUG` flag is true. With `DEBUG == false` it does nothing.
fn dbgPrint(comptime fmt: []const u8, args: anytype) void {
    if (!DEBUG) return;
    std.debug.print(fmt, args);
}
// Tag stored in `Node.type` that says what kind of Markdown construct a node
// represents. The root node built by `Parser.parseDocument` is `.Document`.
const NodeType = enum {
Document,
Paragraph,
Heading,
Blockquote,
UnorderedList,
OrderedList,
ListItem,
CodeBlock, // Indented
FencedCodeBlock,
HorizontalRule,
HtmlBlock,
// The remaining variants look like inline-level constructs (TODO confirm against parseInlines).
Text,
Emphasis,
Strong,
Strikethrough,
Link,
Image,
InlineCode,
RawHtml, // Inline HTML
LineBreak, // Hard line break
// LinkReferenceDefinition, // Not a visible node, stored in parser
};
// Value stored in `Parser.link_references` for one link reference definition.
// Both slices are allocated with the parser's allocator and released in `Parser.deinit`.
const LinkRef = struct {
// Link destination as written in the definition (not unescaped).
url: []const u8,
// Optional title text without its surrounding quotes/parentheses; null when absent.
title: ?[]const u8,
};
/// One node of the parsed Markdown tree.
///
/// A node owns its children and every optional string it carries (`content`,
/// `url`, `title`, `alt_text`, `info_string`); `deinit` releases all of them
/// through `allocator`. Fields that do not apply to a given `type` stay null.
const Node = struct {
    type: NodeType,
    children: ArrayList(Node),
    content: ?[]const u8,
    level: ?u8,
    url: ?[]const u8,
    title: ?[]const u8,
    alt_text: ?[]const u8,
    info_string: ?[]const u8,
    start_number: ?u64,
    tight: ?bool, // For lists
    is_task_list_item: ?bool,
    is_task_list_item_checked: ?bool,
    allocator: Allocator,

    /// Creates an empty node of kind `node_type` with no children and every
    /// optional attribute set to null.
    pub fn init(allocator: Allocator, node_type: NodeType) Node {
        const fresh = Node{
            .allocator = allocator,
            .type = node_type,
            .children = ArrayList(Node).init(allocator),
            .content = null,
            .level = null,
            .url = null,
            .title = null,
            .alt_text = null,
            .info_string = null,
            .start_number = null,
            .tight = null,
            .is_task_list_item = null,
            .is_task_list_item_checked = null,
        };
        return fresh;
    }

    /// Recursively frees the subtree: children first, then the child list,
    /// then each owned string that is set.
    pub fn deinit(self: *Node) void {
        for (self.children.items) |*descendant| {
            descendant.deinit();
        }
        self.children.deinit();

        const owned_strings = [_]?[]const u8{
            self.content,
            self.url,
            self.title,
            self.alt_text,
            self.info_string,
        };
        for (owned_strings) |maybe_bytes| {
            if (maybe_bytes) |bytes| {
                self.allocator.free(bytes);
            }
        }
    }

    /// Appends `child` to this node's child list; the node takes ownership of it.
    pub fn appendChild(self: *Node, child: Node) !void {
        return self.children.append(child);
    }

    /// Dumps the subtree to stderr, one node per line, indented by
    /// `indent_level` and followed by whichever attributes are set.
    pub fn print(self: Node, indent_level: usize) void {
        var remaining = indent_level;
        while (remaining > 0) : (remaining -= 1) {
            std.debug.print(" ", .{});
        }

        std.debug.print("{any}", .{self.type});
        if (self.content) |text| std.debug.print(" \"{s}\"", .{text});
        if (self.level) |lvl| std.debug.print(" (L{d})", .{lvl});
        if (self.url) |dest| std.debug.print(" (url: \"{s}\")", .{dest});
        if (self.title) |caption| std.debug.print(" (title: \"{s}\")", .{caption});
        if (self.alt_text) |alt| std.debug.print(" (alt: \"{s}\")", .{alt});
        if (self.info_string) |info| std.debug.print(" (info: \"{s}\")", .{info});
        if (self.start_number) |first| std.debug.print(" (start: {d})", .{first});
        if (self.tight) |is_tight| std.debug.print(" (tight: {any})", .{is_tight});
        // The checked state is only shown for nodes flagged as task-list items.
        if (self.is_task_list_item orelse false) {
            std.debug.print(" (task: {any})", .{self.is_task_list_item_checked});
        }
        std.debug.print("\n", .{});

        for (self.children.items) |descendant| {
            descendant.print(indent_level + 1);
        }
    }
};
/// Returns how many consecutive bytes at the start of `slice` equal
/// `char_to_count` (0 for an empty slice or a non-matching first byte).
fn countLeadingChars(slice: []const u8, char_to_count: u8) usize {
    for (slice, 0..) |byte, idx| {
        if (byte != char_to_count) return idx;
    }
    return slice.len;
}
/// Strips every leading and trailing byte of `slice` that occurs in
/// `comptime_chars` and returns the remaining sub-slice (no allocation).
fn trimAll(slice: []const u8, comptime_chars: []const u8) []const u8 {
    const trimmed = mem.trim(u8, slice, comptime_chars);
    return trimmed;
}
/// Returns `line` without leading/trailing spaces, tabs, carriage returns
/// and line feeds. The result is a sub-slice of `line`.
fn trimLine(line: []const u8) []const u8 {
    const line_whitespace = " \t\r\n";
    return mem.trim(u8, line, line_whitespace);
}
/// Measures the leading indentation of `line` in columns.
/// A space counts as one column; a tab advances to the next multiple of
/// `tab_stop`. Scanning stops at the first byte that is neither.
fn getIndentWidth(line: []const u8, tab_stop: usize) usize {
    var columns: usize = 0;
    for (line) |byte| {
        switch (byte) {
            ' ' => columns += 1,
            '\t' => columns += tab_stop - (columns % tab_stop),
            else => break,
        }
    }
    return columns;
}
/// Removes up to `indent_to_strip` columns of leading spaces/tabs from `line`
/// and returns the rest as a sub-slice.
///
/// Known simplification: a tab is always consumed whole. If it carries the
/// running column count past `indent_to_strip`, the surplus columns are
/// dropped rather than re-emitted as spaces, because the result must stay a
/// slice of the input.
fn stripIndent(line: []const u8, indent_to_strip: usize, tab_stop: usize) []const u8 {
    var removed_columns: usize = 0;
    var cursor: usize = 0;
    while (cursor < line.len and removed_columns < indent_to_strip) : (cursor += 1) {
        switch (line[cursor]) {
            ' ' => removed_columns += 1,
            '\t' => removed_columns += tab_stop - (removed_columns % tab_stop),
            // First non-indentation byte: stop without consuming it.
            else => break,
        }
    }
    return line[cursor..];
}
/// True when `line` is empty or consists solely of spaces, tabs, carriage
/// returns and line feeds.
fn isBlankLine(line: []const u8) bool {
    for (line) |byte| {
        switch (byte) {
            ' ', '\t', '\r', '\n' => {},
            else => return false,
        }
    }
    return true;
}
const TAB_STOP: usize = 4;
const Parser = struct {
allocator: Allocator,
input_lines: ArrayList([]const u8),
current_line_idx: usize,
link_references: StringHashMap(LinkRef),
recursion_depth: usize, // To prevent deep recursion in list/blockquote parsing
const MAX_RECURSION_DEPTH = 64; // Arbitrary limit
/// Builds a parser over `markdown_text`.
///
/// The input is split into owned, newline-normalised lines up front, so
/// `markdown_text` does not need to outlive the call. On success the caller
/// must eventually call `deinit`. On failure everything allocated so far is
/// released before the error is returned.
pub fn init(allocator: Allocator, markdown_text: []const u8) !Parser {
    var p = Parser{
        .allocator = allocator,
        .input_lines = ArrayList([]const u8).init(allocator),
        .current_line_idx = 0,
        .link_references = StringHashMap(LinkRef).init(allocator),
        .recursion_depth = 0,
    };
    // Line splitting can fail part-way through; without this the lines
    // already collected (and the list itself) would leak.
    errdefer p.deinit();
    try p.preprocessAndSplitLines(markdown_text);
    return p;
}
/// Releases everything the parser owns: each input line, the line list, and
/// every link-reference entry (key, url and optional title) plus the map.
pub fn deinit(self: *Parser) void {
    // Every entry of `input_lines` is an individually allocated copy made by
    // `preprocessAndSplitLines`, so each one is freed on its own.
    for (self.input_lines.items) |line| {
        self.allocator.free(line);
    }
    self.input_lines.deinit();

    // Keys are heap strings produced by `normalizeLabelForMap`; the map does
    // not own them, so they must be freed alongside the values.
    var iter = self.link_references.iterator();
    while (iter.next()) |entry| {
        self.allocator.free(entry.key_ptr.*);
        self.allocator.free(entry.value_ptr.url);
        if (entry.value_ptr.title) |t| self.allocator.free(t);
    }
    self.link_references.deinit();
}
/// Splits `original_input` into lines and stores an owned copy of each in
/// `input_lines`.
///
/// CRLF, lone CR and LF are all recognised as line endings and every stored
/// line that had one ends in a single '\n'. A final line without a line
/// ending is stored as-is (no '\n' is added).
fn preprocessAndSplitLines(self: *Parser, original_input: []const u8) !void {
    var line_start: usize = 0;
    var i: usize = 0;
    while (i < original_input.len) : (i += 1) {
        switch (original_input[i]) {
            '\r' => {
                try self.appendNormalizedLine(original_input[line_start..i], true);
                // Treat CRLF as one line ending: skip the LF as well.
                if (i + 1 < original_input.len and original_input[i + 1] == '\n') i += 1;
                line_start = i + 1;
            },
            '\n' => {
                try self.appendNormalizedLine(original_input[line_start..i], true);
                line_start = i + 1;
            },
            else => {},
        }
    }
    if (line_start < original_input.len) {
        try self.appendNormalizedLine(original_input[line_start..], false);
    }
}

/// Stores an owned copy of `content` in `input_lines`, followed by '\n' when
/// `terminate` is set. The copy is freed again if the list append fails.
fn appendNormalizedLine(self: *Parser, content: []const u8, terminate: bool) !void {
    const extra: usize = if (terminate) 1 else 0;
    const owned = try self.allocator.alloc(u8, content.len + extra);
    errdefer self.allocator.free(owned);
    var k: usize = 0;
    while (k < content.len) : (k += 1) {
        owned[k] = content[k];
    }
    if (terminate) owned[content.len] = '\n';
    try self.input_lines.append(owned);
}
/// Returns the line at the cursor, or null once the cursor is past the last line.
fn currentLine(self: *const Parser) ?[]const u8 {
    const lines = self.input_lines.items;
    return if (self.current_line_idx < lines.len) lines[self.current_line_idx] else null;
}
/// Returns the line `offset` positions after the cursor without moving it,
/// or null if that position is beyond the input.
fn peekLine(self: *const Parser, offset: usize) ?[]const u8 {
    const lines = self.input_lines.items;
    const target = self.current_line_idx + offset;
    return if (target < lines.len) lines[target] else null;
}
/// Moves the cursor to the next line. Does nothing once the cursor is
/// already past the last line.
fn advanceLine(self: *Parser) void {
    if (self.current_line_idx >= self.input_lines.items.len) return;
    self.current_line_idx += 1;
}
/// Returns the line at the cursor (null at end of input) and then advances
/// the cursor past it.
fn consumeCurrentLine(self: *Parser) ?[]const u8 {
    // The return value is evaluated first; the deferred advance runs afterwards.
    defer self.advanceLine();
    return self.currentLine();
}
/// True when the cursor has moved past the last input line.
fn isEof(self: *const Parser) bool {
    const total_lines = self.input_lines.items.len;
    return !(self.current_line_idx < total_lines);
}
/// Parses all input lines into a `.Document` node and returns it.
///
/// At each position the block parsers are tried in a fixed order: blank
/// lines, link reference definition, thematic break, ATX heading, fenced
/// code, HTML block, blockquote, unordered list, ordered list, indented
/// code, and finally paragraph / Setext heading. The first one that accepts
/// the line wins. On error the partially built document is freed.
fn parseDocument(self: *Parser) !Node {
    var document = Node.init(self.allocator, .Document);
    errdefer document.deinit();

    while (!self.isEof()) {
        // Blank lines are consumed first, so the line examined below is
        // never blank.
        if (self.tryParseBlankLines(&document)) continue;

        const line = self.currentLine().?;
        const indent = getIndentWidth(line, TAB_STOP);

        if (try self.tryParseLinkReferenceDefinition(&document)) continue;
        if (try self.tryParseThematicBreak(&document)) continue;
        if (try self.tryParseAtxHeading(&document)) continue;
        if (try self.tryParseFencedCodeBlock(&document)) continue;
        if (try self.tryParseHtmlBlock(&document)) continue; // TODO
        if (try self.tryParseBlockquote(&document)) continue;
        if (try self.tryParseUnorderedList(&document, indent)) continue;
        if (try self.tryParseOrderedList(&document, indent)) continue;

        // An indented code block must not interrupt a paragraph. It only
        // counts as an interruption when the paragraph is the last block AND
        // no blank line separates it from this line; after a blank line the
        // paragraph is closed and an indented line starts a code block.
        const blocks = document.children.items;
        const after_paragraph = blocks.len > 0 and blocks[blocks.len - 1].type == .Paragraph;
        const prev_line_blank = self.current_line_idx > 0 and
            isBlankLine(self.input_lines.items[self.current_line_idx - 1]);
        const would_interrupt_paragraph = after_paragraph and !prev_line_blank;
        if (indent >= TAB_STOP and !would_interrupt_paragraph) {
            if (try self.tryParseIndentedCodeBlock(&document)) continue;
        }

        // Fallback: paragraphs and Setext headings.
        if (try self.tryParseParagraphOrSetext(&document)) continue;

        // Nothing accepted the line. Consume it anyway so the loop always
        // makes progress.
        dbgPrint("Warning: Unparsed line: {s}\n", .{line});
        self.advanceLine();
    }
    return document;
}
// --- Block Parsers ---
/// Skips the run of blank lines at the cursor, if any.
/// Returns true when at least one line was consumed. `parent_node` is
/// currently unused.
fn tryParseBlankLines(self: *Parser, parent_node: *Node) bool {
    _ = parent_node; // Not used yet, might be for tight list determination
    const first_idx = self.current_line_idx;
    while (self.currentLine()) |candidate| {
        if (!isBlankLine(candidate)) break;
        self.advanceLine();
    }
    return self.current_line_idx != first_idx;
}
/// Tries to parse a single-line link reference definition at the cursor:
///
///     [label]: destination "optional title"
///
/// On success the definition is recorded in `link_references` (the first
/// definition of a label wins; later duplicates are ignored), the line is
/// consumed and true is returned. No node is added to the tree. On any
/// mismatch the cursor is left untouched and false is returned.
///
/// Limitations: the definition must fit on one line, and neither the
/// destination nor the title is unescaped.
fn tryParseLinkReferenceDefinition(self: *Parser, _: *Node) !bool {
    const line = self.currentLine() orelse return false;
    const indent = getIndentWidth(line, TAB_STOP);
    if (indent >= TAB_STOP) return false; // Max 3 spaces indent
    const text = stripIndent(line, indent, TAB_STOP);
    if (text.len == 0 or text[0] != '[') return false;

    // --- Label: locate the `]` that balances the opening `[`. ---
    // text[0] is '[', so `balance` is at least 1 before any decrement.
    var end_label_idx: ?usize = null;
    var balance: usize = 0;
    for (text, 0..) |char, idx| {
        if (char == '[') {
            balance += 1;
        } else if (char == ']') {
            balance -= 1;
            if (balance == 0) {
                end_label_idx = idx;
                break;
            }
        } else if (char == '\n' or char == '\r') {
            return false; // Label cannot span lines
        }
    }
    const label_end = end_label_idx orelse return false;
    // Borrowed slice of the line; it is only read by normalizeLabelForMap,
    // which makes its own copy.
    const label = mem.trim(u8, text[1..label_end], " \t\r\n");
    if (label.len == 0) return false; // `[]:` or whitespace-only label

    var i = label_end + 1;
    if (i >= text.len or text[i] != ':') return false;
    i += 1; // Skip ':'

    // Skip optional whitespace before the destination.
    while (i < text.len and ascii.isWhitespace(text[i]) and text[i] != '\n') : (i += 1) {}
    if (i >= text.len or text[i] == '\n') return false; // No destination

    // --- Destination ---
    var dest: []const u8 = "";
    var url_end: usize = i;
    if (text[i] == '<') {
        // Destination wrapped in <...>.
        const url_start = i + 1;
        var angle_depth: usize = 1;
        url_end = url_start;
        while (url_end < text.len) {
            const c = text[url_end];
            if (c == '<') {
                angle_depth += 1;
            } else if (c == '>') {
                angle_depth -= 1;
                if (angle_depth == 0) break;
            } else if (c == '\n' or c == '\\') {
                // Newline or backslash inside <...> is rejected.
                return false;
            }
            url_end += 1;
        }
        if (angle_depth != 0) return false; // Mismatched <>
        dest = text[url_start..url_end];
        url_end += 1; // past '>'
    } else {
        // Bare destination: runs to the first whitespace or unbalanced ')'.
        const url_start = i;
        var paren_depth: usize = 0;
        while (url_end < text.len) {
            const c = text[url_end];
            if (ascii.isWhitespace(c)) break;
            if (c == '(') {
                paren_depth += 1;
            } else if (c == ')') {
                if (paren_depth == 0) break;
                paren_depth -= 1;
            } else if (c == '\\' and url_end + 1 < text.len) {
                url_end += 1; // Keep the escaped character as part of the destination
            }
            url_end += 1;
        }
        if (url_start == url_end) return false; // Empty destination
        dest = text[url_start..url_end];
    }

    // --- Optional title ---
    i = url_end;
    while (i < text.len and ascii.isWhitespace(text[i]) and text[i] != '\n') : (i += 1) {}
    // Borrowed until the whole line has been validated, so early `return false`
    // paths have nothing to free.
    var raw_title: ?[]const u8 = null;
    if (i < text.len and text[i] != '\n') {
        const opener = text[i];
        if (opener == '"' or opener == '\'' or opener == '(') {
            const closer: u8 = if (opener == '(') ')' else opener;
            const title_start = i + 1;
            var title_end = title_start;
            var found_closing = false;
            while (title_end < text.len) {
                if (text[title_end] == '\\' and title_end + 1 < text.len) {
                    title_end += 2; // Skip escaped char
                    continue;
                }
                if (text[title_end] == closer) {
                    found_closing = true;
                    break;
                }
                if (text[title_end] == '\n') break; // Title must close on this line
                title_end += 1;
            }
            if (found_closing) {
                // The title only counts if nothing but whitespace follows it.
                var j = title_end + 1;
                while (j < text.len and ascii.isWhitespace(text[j]) and text[j] != '\n') : (j += 1) {}
                if (j >= text.len or text[j] == '\n') {
                    raw_title = text[title_start..title_end];
                    i = title_end + 1; // Advance past closing quote/paren
                }
            }
        }
    }

    // Anything left on the line other than whitespace invalidates the definition.
    while (i < text.len and ascii.isWhitespace(text[i]) and text[i] != '\n') : (i += 1) {}
    if (i < text.len and text[i] != '\n') return false;

    // --- Store (first definition wins) ---
    const key = try self.normalizeLabelForMap(label);
    if (self.link_references.contains(key)) {
        self.allocator.free(key);
    } else {
        errdefer self.allocator.free(key);
        const url = try self.allocator.dupe(u8, dest);
        errdefer self.allocator.free(url);
        const title: ?[]const u8 = if (raw_title) |t| try self.allocator.dupe(u8, t) else null;
        errdefer if (title) |t| self.allocator.free(t);
        try self.link_references.put(key, LinkRef{ .url = url, .title = title });
    }

    self.advanceLine();
    return true;
}
/// Produces the map key for a link label: ASCII-lowercased, with leading and
/// trailing whitespace removed and every internal whitespace run collapsed
/// to a single space. The returned slice is newly allocated and owned by the
/// caller.
fn normalizeLabelForMap(self: *Parser, label: []const u8) ![]const u8 {
    var buf = ArrayList(u8).init(self.allocator);
    // Frees the partial buffer if any append (or the final conversion) fails.
    errdefer buf.deinit();

    // Starting as "true" suppresses leading whitespace.
    var last_was_space = true;
    for (label) |c| {
        if (ascii.isWhitespace(c)) {
            if (!last_was_space) {
                try buf.append(' ');
                last_was_space = true;
            }
        } else {
            try buf.append(ascii.toLower(c));
            last_was_space = false;
        }
    }
    // A trailing whitespace run leaves exactly one space behind; drop it.
    if (buf.items.len > 0 and buf.items[buf.items.len - 1] == ' ') {
        _ = buf.pop();
    }
    return buf.toOwnedSlice();
}
/// Tries to parse a thematic break (`***`, `---`, `___`, optionally with
/// spaces/tabs between the markers) at the cursor.
///
/// The line must be indented less than TAB_STOP columns, use one marker
/// character throughout, contain at least three markers and nothing else
/// except spaces and tabs. On success a `.HorizontalRule` node is appended
/// to `parent_node`, the line is consumed and true is returned.
fn tryParseThematicBreak(self: *Parser, parent_node: *Node) !bool {
    const line = self.currentLine() orelse return false;
    const indent = getIndentWidth(line, TAB_STOP);
    if (indent >= TAB_STOP) return false;

    var marker: ?u8 = null;
    var marker_count: usize = 0;
    for (stripIndent(line, indent, TAB_STOP)) |byte| {
        switch (byte) {
            '*', '-', '_' => {
                if (marker) |chosen| {
                    // Mixing marker characters disqualifies the line.
                    if (chosen != byte) return false;
                } else {
                    marker = byte;
                }
                marker_count += 1;
            },
            ' ', '\t' => {}, // spaces are allowed
            '\n', '\r' => break, // end of line
            else => return false,
        }
    }
    if (marker_count < 3) return false;

    var rule_node = Node.init(self.allocator, .HorizontalRule);
    errdefer rule_node.deinit();
    try parent_node.appendChild(rule_node);
    self.advanceLine();
    return true;
}
/// Tries to parse an ATX heading (`# Title`, up to `###### Title`) at the cursor.
///
/// Rules applied:
///   * indentation must be below TAB_STOP columns;
///   * 1 to 6 leading `#`, followed by space, tab or end of line
///     (`#foo` is not a heading, a bare `#` is an empty one);
///   * an optional closing run of `#` is removed when it is preceded by a
///     space/tab or makes up the whole heading text (`## foo ##` -> "foo",
///     `# foo#` -> "foo#", `### ###` -> "");
///   * surrounding whitespace is trimmed from the text.
///
/// On success a `.Heading` node with `level` set and inline children parsed
/// from the text is appended to `parent_node`, the line is consumed and true
/// is returned. Otherwise the cursor is untouched and false is returned.
fn tryParseAtxHeading(self: *Parser, parent_node: *Node) !bool {
    const line = self.currentLine() orelse return false;
    const indent = getIndentWidth(line, TAB_STOP);
    if (indent >= TAB_STOP) return false;
    const content = stripIndent(line, indent, TAB_STOP);

    // Count the opening '#' run. The scan stops at 7 so the counter stays
    // small enough for u8 whatever the line length.
    var level: u8 = 0;
    while (level < content.len and level <= 6 and content[level] == '#') : (level += 1) {}
    if (level == 0 or level > 6) return false;

    // The opening run must be followed by whitespace or the end of the line.
    if (level < content.len) {
        switch (content[level]) {
            ' ', '\t', '\n', '\r' => {},
            else => return false,
        }
    }

    var text_start: usize = level;
    while (text_start < content.len and (content[text_start] == ' ' or content[text_start] == '\t')) : (text_start += 1) {}

    // Drop trailing whitespace, including the line ending.
    var text_end: usize = content.len;
    while (text_end > text_start) : (text_end -= 1) {
        switch (content[text_end - 1]) {
            ' ', '\t', '\n', '\r' => {},
            else => break,
        }
    }

    // Optional closing sequence: a trailing run of '#'.
    var closing_start = text_end;
    while (closing_start > text_start and content[closing_start - 1] == '#') : (closing_start -= 1) {}
    if (closing_start < text_end) {
        const is_whole_text = closing_start == text_start;
        if (is_whole_text or content[closing_start - 1] == ' ' or content[closing_start - 1] == '\t') {
            text_end = closing_start;
            // Trim the whitespace that separated the text from the closing run.
            while (text_end > text_start and (content[text_end - 1] == ' ' or content[text_end - 1] == '\t')) : (text_end -= 1) {}
        }
        // Otherwise the '#' characters touch the text and belong to it.
    }

    var heading_node = Node.init(self.allocator, .Heading);
    errdefer heading_node.deinit();
    heading_node.level = level;
    try self.parseInlines(content[text_start..text_end], &heading_node, .{});
    try parent_node.appendChild(heading_node);
    self.advanceLine();
    return true;
}
/// Tries to parse a fenced code block (``` or ~~~ fences) starting at the cursor.
///
/// Opening fence: indented less than TAB_STOP columns, at least three
/// identical fence characters, followed by an optional info string (which
/// may not contain a backtick when the fence uses backticks).
/// Closing fence: indented less than TAB_STOP columns, same character, at
/// least as long as the opening fence, followed only by whitespace.
///
/// Body lines have up to the opening fence's indentation removed and keep
/// their line endings. On success a `.FencedCodeBlock` node (with `content`
/// and, if present, `info_string`) is appended to `parent_node` and true is
/// returned.
///
/// If the input ends before a closing fence is found, the cursor is rewound
/// to the opening line and false is returned, so the text is handled by the
/// other block parsers. NOTE(review): CommonMark instead closes an unclosed
/// fence at end of document; kept as-is because it is the documented choice here.
fn tryParseFencedCodeBlock(self: *Parser, parent_node: *Node) !bool {
    const line = self.currentLine() orelse return false;
    const initial_line_idx = self.current_line_idx;
    const indent = getIndentWidth(line, TAB_STOP);
    if (indent >= TAB_STOP) return false;

    const opening = stripIndent(line, indent, TAB_STOP);
    if (opening.len < 3) return false;
    const fence_char = opening[0];
    if (fence_char != '`' and fence_char != '~') return false;
    const fence_len = countLeadingChars(opening, fence_char);
    if (fence_len < 3) return false;

    // Info string (borrowed from the line until the block is confirmed).
    const info_raw = mem.trim(u8, opening[fence_len..], " \t\r\n");
    if (fence_char == '`' and mem.indexOfScalar(u8, info_raw, '`') != null) {
        return false;
    }

    self.advanceLine(); // Consume opening fence line

    var code = ArrayList(u8).init(self.allocator);
    defer code.deinit();

    while (self.currentLine()) |body_line| {
        const body_indent = getIndentWidth(body_line, TAB_STOP);
        // A line indented TAB_STOP or more can never be a closing fence.
        if (body_indent < TAB_STOP) {
            const candidate = stripIndent(body_line, body_indent, TAB_STOP);
            const run_len = countLeadingChars(candidate, fence_char);
            if (run_len >= fence_len and mem.trim(u8, candidate[run_len..], " \t\r\n").len == 0) {
                self.advanceLine(); // Consume closing fence

                // From here on the node is the sole owner of its strings, so
                // a single errdefer is enough to clean up on failure.
                var code_block_node = Node.init(self.allocator, .FencedCodeBlock);
                errdefer code_block_node.deinit();
                if (info_raw.len > 0) {
                    code_block_node.info_string = try self.allocator.dupe(u8, info_raw);
                }
                code_block_node.content = try code.toOwnedSlice();
                try parent_node.appendChild(code_block_node);
                return true;
            }
        }
        // Ordinary body line: remove up to the opening fence's indentation.
        // Stored lines already end in '\n' (except possibly the last line of input).
        try code.appendSlice(stripIndent(body_line, indent, TAB_STOP));
        self.advanceLine();
    }

    // Reached EOF without a closing fence: rewind and let other parsers handle it.
    self.current_line_idx = initial_line_idx;
    return false;
}
/// Parses an indented code block starting at the cursor.
///
/// The caller is expected to have checked that the current line is
/// non-blank, indented at least TAB_STOP columns and not a paragraph
/// continuation. Lines are collected while they are blank or indented at
/// least TAB_STOP columns; TAB_STOP columns of indentation are removed from
/// each code line. Blank lines inside the block become a bare "\n"; blank
/// lines after the last code line are consumed but left out of the content.
///
/// On success a `.CodeBlock` node is appended to `parent_node` and true is
/// returned. If no code line is found the cursor is rewound and false is returned.
fn tryParseIndentedCodeBlock(self: *Parser, parent_node: *Node) !bool {
    const initial_line_idx = self.current_line_idx;
    var code = ArrayList(u8).init(self.allocator);
    defer code.deinit();

    // Number of "\n" placeholders appended since the last real code line.
    var trailing_blank_lines: usize = 0;
    var saw_code_line = false;

    while (self.currentLine()) |line| {
        if (isBlankLine(line)) {
            // Blank lines before the first code line are skipped entirely.
            if (saw_code_line) {
                try code.append('\n');
                trailing_blank_lines += 1;
            }
            self.advanceLine();
            continue;
        }
        // A non-blank line that is not indented enough ends the block.
        if (getIndentWidth(line, TAB_STOP) < TAB_STOP) break;

        trailing_blank_lines = 0;
        saw_code_line = true;
        // Stored lines carry their own line ending.
        try code.appendSlice(stripIndent(line, TAB_STOP, TAB_STOP));
        self.advanceLine();
    }

    if (!saw_code_line) {
        self.current_line_idx = initial_line_idx;
        return false;
    }

    // The last `trailing_blank_lines` bytes are exactly the placeholders for
    // trailing blank lines; cut them off.
    code.shrinkRetainingCapacity(code.items.len - trailing_blank_lines);

    // The node becomes the sole owner of the content, so one errdefer suffices.
    var code_block_node = Node.init(self.allocator, .CodeBlock);
    errdefer code_block_node.deinit();
    code_block_node.content = try code.toOwnedSlice();
    try parent_node.appendChild(code_block_node);
    return true;
}
/// Fallback block parser: collects a paragraph, or a Setext heading when a
/// collected line is directly followed by a Setext underline.
///
/// Lines are gathered until a blank line, the end of input, or (from the
/// second line on, and only for lines indented less than TAB_STOP) a line
/// that starts a thematic break, ATX heading, fenced code block, blockquote
/// or list item. Each line is trimmed and the lines are joined with a
/// single space.
///
/// Returns false only when no line was collected. Otherwise the lines are
/// consumed and true is returned; a `.Heading` or `.Paragraph` node with
/// parsed inline children is appended to `parent_node`, except that empty
/// text or a paragraph without inline children adds no node.
fn tryParseParagraphOrSetext(self: *Parser, parent_node: *Node) !bool {
    var text_buf = ArrayList(u8).init(self.allocator);
    defer text_buf.deinit();
    var line_count: usize = 0;
    var setext_level: u8 = 0;

    while (self.currentLine()) |line| {
        if (isBlankLine(line)) break; // Blank line ends a paragraph

        // The first line is accepted unconditionally; later lines may be
        // the start of a block that interrupts the paragraph.
        if (line_count > 0) {
            const indent = getIndentWidth(line, TAB_STOP);
            if (indent < TAB_STOP) {
                const rest = stripIndent(line, indent, TAB_STOP);
                if (isThematicBreakStart(rest)) break;
                if (isAtxHeadingStart(rest)) break;
                if (isFencedCodeBlockStart(rest)) break;
                if (isBlockquoteStart(rest)) break;
                if (isListItemStart(rest) != .NotListItem) break;
            }
        }

        // Soft line breaks are represented by a single space.
        if (line_count > 0) try text_buf.append(' ');
        try text_buf.appendSlice(trimLine(line));
        line_count += 1;

        // A Setext underline on the next line turns everything collected so
        // far (including this line) into a heading.
        if (self.peekLine(1)) |next_line| {
            const underline_level = getSetextUnderlineLevel(next_line);
            if (underline_level > 0) {
                self.advanceLine(); // Consume current text line
                self.advanceLine(); // Consume underline line
                setext_level = underline_level;
                break;
            }
        }
        self.advanceLine();
    }

    if (line_count == 0) return false;

    const combined_text = mem.trim(u8, text_buf.items, " ");
    if (combined_text.len == 0) {
        // Lines were consumed but produced no text; nothing to add.
        return true;
    }

    if (setext_level > 0) {
        var heading_node = Node.init(self.allocator, .Heading);
        errdefer heading_node.deinit();
        heading_node.level = setext_level;
        try self.parseInlines(combined_text, &heading_node, .{});
        try parent_node.appendChild(heading_node);
    } else {
        var para_node = Node.init(self.allocator, .Paragraph);
        errdefer para_node.deinit();
        try self.parseInlines(combined_text, &para_node, .{});
        // Do not add paragraphs for which inline parsing produced nothing.
        if (para_node.children.items.len > 0) {
            try parent_node.appendChild(para_node);
        } else {
            para_node.deinit();
        }
    }
    return true;
}
// --- Inline Parsing ---
// Simplified inline parsing. A full CommonMark inline parser is much more complex.
// This version handles basic cases without a sophisticated delimiter stack.
// Per-call options for `parseInlines`. The struct currently has no fields, so
// callers in this file pass `.{}` (or forward the value they received); it is a
// placeholder for the switches sketched in the comment below.
const InlineContext = struct {
// Future: allow_links: bool, allow_emphasis: bool, etc.
};
// Appends `text[segment_start.* .. segment_end]` to `parent_node` as a `.Text`
// child when that range is non-empty, then moves `segment_start.*` up to
// `segment_end`. Private helper for `parseInlines` (Zig has no closures, so the
// pending-segment state is passed explicitly).
fn flushInlineText(self: *Parser, text: []const u8, parent_node: *Node, segment_start: *usize, segment_end: usize) !void {
    if (segment_start.* < segment_end) {
        var text_node = Node.init(self.allocator, .Text);
        text_node.content = try self.allocator.dupe(u8, text[segment_start.*..segment_end]);
        errdefer text_node.deinit();
        try parent_node.appendChild(text_node);
    }
    segment_start.* = segment_end;
}
// Parses the inline constructs found in `text` and appends the resulting nodes
// to `parent_node`, in source order.
//
// Recognised constructs (simplified, no delimiter stack):
//   1. backslash escapes of Markdown punctuation  -> `.Text` with the bare char
//   2. backtick code spans                         -> `.InlineCode`
//   3. `*`/`_` emphasis, `**`/`__` strong, `~~`    -> via `findAndParseEmphasisLike`
//   4. inline links / images `[t](url "title")`    -> `.Link` / `.Image`
//   5. two-or-more spaces followed by `\n`         -> `.LineBreak`
// Everything else is accumulated and emitted as `.Text` nodes.
//
// Parameters:
//   text        - the inline source; slices of it are copied, never retained.
//   parent_node - receives the produced children.
//   context     - forwarded unchanged when link text is parsed recursively.
// Errors: propagates allocation / `appendChild` failures.
fn parseInlines(self: *Parser, text: []const u8, parent_node: *Node, context: InlineContext) !void {
    var current_pos: usize = 0;
    // Start of the run of plain text that has not been emitted yet.
    var text_segment_start: usize = 0;
    while (current_pos < text.len) {
        const char = text[current_pos];
        var consumed_by_special_parser = false;
        if (char == '\\' and current_pos + 1 < text.len and isMarkdownPunctuation(text[current_pos + 1])) {
            // 1. Escaped character: drop the backslash, keep the next byte as text.
            try self.flushInlineText(text, parent_node, &text_segment_start, current_pos);
            var escaped_text_node = Node.init(self.allocator, .Text);
            escaped_text_node.content = try self.allocator.dupe(u8, text[current_pos + 1 .. current_pos + 2]);
            errdefer escaped_text_node.deinit();
            try parent_node.appendChild(escaped_text_node);
            current_pos += 2;
            text_segment_start = current_pos;
            consumed_by_special_parser = true;
        } else if (char == '`') {
            // 2. Inline code span: the closing run must have exactly as many
            // backticks as the opening run.
            const start_ticks_len = countLeadingChars(text[current_pos..], '`');
            var end_ticks_pos: ?usize = null;
            var search_pos = current_pos + start_ticks_len;
            while (search_pos < text.len) {
                if (text[search_pos] == '`') {
                    const current_end_ticks_len = countLeadingChars(text[search_pos..], '`');
                    if (current_end_ticks_len == start_ticks_len) {
                        end_ticks_pos = search_pos;
                        break;
                    }
                    search_pos += current_end_ticks_len;
                } else {
                    search_pos += 1;
                }
            }
            if (end_ticks_pos) |end_pos| {
                try self.flushInlineText(text, parent_node, &text_segment_start, current_pos);
                var code_node = Node.init(self.allocator, .InlineCode);
                var code_content_slice = text[current_pos + start_ticks_len .. end_pos];
                // CommonMark: strip one space from each end when the content both
                // starts and ends with a space and is not made of spaces only.
                if (code_content_slice.len > 1 and code_content_slice[0] == ' ' and code_content_slice[code_content_slice.len - 1] == ' ') {
                    var all_spaces = true;
                    for (code_content_slice[1 .. code_content_slice.len - 1]) |cc| {
                        if (cc != ' ') {
                            all_spaces = false;
                            break;
                        }
                    }
                    if (!all_spaces) {
                        code_content_slice = code_content_slice[1 .. code_content_slice.len - 1];
                    }
                }
                code_node.content = try self.allocator.dupe(u8, code_content_slice);
                errdefer code_node.deinit();
                try parent_node.appendChild(code_node);
                current_pos = end_pos + start_ticks_len;
                text_segment_start = current_pos;
                consumed_by_special_parser = true;
            }
            // No closing run: the backtick stays literal and is skipped below.
        } else if (char == '*' or char == '_' or char == '~') {
            // 3. Emphasis-like spans. A doubled marker is tried first
            // (strong / strikethrough); `*` and `_` then fall back to the
            // single-marker form. A lone `~` is always literal.
            const doubled = current_pos + 1 < text.len and text[current_pos + 1] == char;
            if (doubled) {
                consumed_by_special_parser = try findAndParseEmphasisLike(self, text, current_pos, text[current_pos .. current_pos + 2], parent_node, &text_segment_start, &current_pos);
            }
            if (!consumed_by_special_parser and char != '~') {
                consumed_by_special_parser = try findAndParseEmphasisLike(self, text, current_pos, text[current_pos .. current_pos + 1], parent_node, &text_segment_start, &current_pos);
            }
        } else if (char == '[' or (char == '!' and current_pos + 1 < text.len and text[current_pos + 1] == '[')) {
            // 4. Inline links and images: [text](url "title"), ![alt](src "title")
            const is_image = (char == '!');
            const text_start_bracket = if (is_image) current_pos + 1 else current_pos;
            // Find the `]` that balances the opening `[`, skipping escapes.
            var text_end_bracket: ?usize = null;
            var bracket_balance: usize = 1;
            var scan_pos = text_start_bracket + 1;
            while (scan_pos < text.len) {
                if (text[scan_pos] == '\\' and scan_pos + 1 < text.len) {
                    scan_pos += 2;
                    continue;
                }
                if (text[scan_pos] == '[') {
                    bracket_balance += 1;
                } else if (text[scan_pos] == ']') {
                    bracket_balance -= 1;
                    if (bracket_balance == 0) {
                        text_end_bracket = scan_pos;
                        break;
                    }
                }
                scan_pos += 1;
            }
            if (text_end_bracket) |text_end_idx| {
                const link_text_slice = text[text_start_bracket + 1 .. text_end_idx];
                scan_pos = text_end_idx + 1;
                if (scan_pos < text.len and text[scan_pos] == '(') {
                    // Find the `)` that balances the opening `(`.
                    const url_part_start = scan_pos + 1;
                    var url_part_end: ?usize = null;
                    var paren_balance: usize = 1;
                    scan_pos = url_part_start;
                    while (scan_pos < text.len) {
                        if (text[scan_pos] == '\\' and scan_pos + 1 < text.len) {
                            scan_pos += 2;
                            continue;
                        }
                        if (text[scan_pos] == '(') {
                            paren_balance += 1;
                        } else if (text[scan_pos] == ')') {
                            paren_balance -= 1;
                            if (paren_balance == 0) {
                                url_part_end = scan_pos;
                                break;
                            }
                        }
                        scan_pos += 1;
                    }
                    if (url_part_end) |url_p_end| {
                        try self.flushInlineText(text, parent_node, &text_segment_start, current_pos);
                        const url_title_content = text[url_part_start..url_p_end];
                        var dest_url: []const u8 = "";
                        var link_title_val: ?[]const u8 = null;
                        var content_scan_pos: usize = 0;
                        // Skip leading whitespace inside the parentheses.
                        while (content_scan_pos < url_title_content.len and ascii.isWhitespace(url_title_content[content_scan_pos])) : (content_scan_pos += 1) {}
                        const dest_start = content_scan_pos;
                        var dest_end = dest_start;
                        // Destination, form 1: <url>
                        var angle_form_ok = false;
                        if (dest_start < url_title_content.len and url_title_content[dest_start] == '<') {
                            var close_pos = dest_start + 1;
                            while (close_pos < url_title_content.len and url_title_content[close_pos] != '>') {
                                if (url_title_content[close_pos] == '\\' and close_pos + 1 < url_title_content.len) close_pos += 1; // skip escaped
                                close_pos += 1;
                            }
                            if (close_pos < url_title_content.len and url_title_content[close_pos] == '>') {
                                dest_url = url_title_content[dest_start + 1 .. close_pos];
                                dest_end = close_pos + 1; // past '>'
                                angle_form_ok = true;
                            }
                        }
                        // Destination, form 2: bare URL (also used when `<` is
                        // never closed). Stops at whitespace or a parenthesis.
                        if (!angle_form_ok) {
                            dest_end = dest_start;
                            while (dest_end < url_title_content.len and !ascii.isWhitespace(url_title_content[dest_end])) {
                                if (url_title_content[dest_end] == '(' or url_title_content[dest_end] == ')') break;
                                if (url_title_content[dest_end] == '\\' and dest_end + 1 < url_title_content.len) dest_end += 1;
                                dest_end += 1;
                            }
                            dest_url = url_title_content[dest_start..dest_end];
                        }
                        content_scan_pos = dest_end;
                        // Skip whitespace between URL and title.
                        while (content_scan_pos < url_title_content.len and ascii.isWhitespace(url_title_content[content_scan_pos])) : (content_scan_pos += 1) {}
                        if (content_scan_pos < url_title_content.len) {
                            // Optional title in "...", '...' or (...).
                            const title_q_char = url_title_content[content_scan_pos];
                            if (title_q_char == '"' or title_q_char == '\'' or title_q_char == '(') {
                                const title_closing_char: u8 = if (title_q_char == '(') ')' else title_q_char;
                                const title_text_start = content_scan_pos + 1;
                                var title_text_end = title_text_start;
                                while (title_text_end < url_title_content.len) {
                                    if (url_title_content[title_text_end] == '\\' and title_text_end + 1 < url_title_content.len) {
                                        title_text_end += 2;
                                        continue;
                                    }
                                    if (url_title_content[title_text_end] == title_closing_char) break;
                                    title_text_end += 1;
                                }
                                if (title_text_end < url_title_content.len and url_title_content[title_text_end] == title_closing_char) {
                                    link_title_val = try self.allocator.dupe(u8, url_title_content[title_text_start..title_text_end]);
                                    // TODO: unescape title
                                    content_scan_pos = title_text_end + 1;
                                }
                            }
                        }
                        // Skip trailing whitespace inside the parentheses.
                        while (content_scan_pos < url_title_content.len and ascii.isWhitespace(url_title_content[content_scan_pos])) : (content_scan_pos += 1) {}
                        if (content_scan_pos == url_title_content.len) {
                            // Everything inside (...) was understood: build the node.
                            var node = Node.init(self.allocator, if (is_image) .Image else .Link);
                            errdefer node.deinit();
                            // Hand the already-allocated title over first so it is
                            // released if duplicating the URL fails.
                            // NOTE(review): these field-level errdefers sit next to
                            // `errdefer node.deinit()`; confirm `Node.deinit` does
                            // not free the same fields.
                            node.title = link_title_val;
                            errdefer if (node.title) |t| self.allocator.free(t);
                            node.url = try self.allocator.dupe(u8, dest_url); // TODO: unescape URL
                            errdefer if (node.url) |u| self.allocator.free(u);
                            if (is_image) {
                                node.alt_text = try self.allocator.dupe(u8, link_text_slice); // TODO: unescape alt
                                errdefer if (node.alt_text) |al| self.allocator.free(al);
                            } else {
                                try self.parseInlines(link_text_slice, &node, context);
                            }
                            try parent_node.appendChild(node);
                            current_pos = url_p_end + 1;
                            text_segment_start = current_pos;
                            consumed_by_special_parser = true;
                        } else {
                            // Malformed inline link: '[' stays literal.
                            if (link_title_val) |ltv| self.allocator.free(ltv);
                        }
                    }
                }
                // TODO: Reference links [text][label], [label][], [label]
                // This requires looking up `label` in `self.link_references`.
            }
        } else if (char == '\n') {
            // 5. Hard line break: a newline preceded by two or more spaces.
            // Paragraph text built elsewhere in this file is joined with spaces,
            // so this only fires for callers that keep raw newlines in `text`.
            // A newline without the two spaces is left in the text run.
            if (current_pos >= 2 and text[current_pos - 1] == ' ' and text[current_pos - 2] == ' ') {
                // The trailing spaces belong to the break, not to the text:
                // emit the pending text without them (and only once).
                var trailing_start = current_pos;
                while (trailing_start > text_segment_start and text[trailing_start - 1] == ' ') {
                    trailing_start -= 1;
                }
                try self.flushInlineText(text, parent_node, &text_segment_start, trailing_start);
                var br_node = Node.init(self.allocator, .LineBreak);
                errdefer br_node.deinit();
                try parent_node.appendChild(br_node);
                current_pos += 1; // consume \n
                text_segment_start = current_pos;
                consumed_by_special_parser = true;
            }
        }
        // TODO: Autolinks <http://foo.bar>, <mailto:[email protected]>
        // TODO: Raw HTML tags <a>...</a>
        if (!consumed_by_special_parser) {
            current_pos += 1;
        }
    }
    // Emit whatever plain text is still pending.
    try self.flushInlineText(text, parent_node, &text_segment_start, text.len);
}
// Tries to parse an emphasis-like span that opens at `start_pos` with
// `marker_slice` (`*`, `_`, `**`, `__` or `~~`).
//
// The text is scanned for the next occurrence of the same marker, skipping
// backslash-escaped bytes and an immediately adjacent closing marker (empty
// content). On success the text pending since `text_segment_start_ptr.*` is
// appended to `parent_node` as a `.Text` node, followed by a `.Strong`,
// `.Strikethrough` or `.Emphasis` node whose children come from parsing the
// enclosed text; both `text_segment_start_ptr.*` and `current_pos_ptr.*` are set
// to the position just past the closing marker and `true` is returned.
//
// When no usable closing marker exists the function returns `false` and leaves
// `parent_node` and both out-pointers untouched, so the caller can treat the
// marker as literal text. (Flanking rules are not implemented.)
//
// `marker_slice` must be non-empty.
// Errors: propagates allocation / `appendChild` / `parseInlines` failures.
fn findAndParseEmphasisLike(self: *Parser, text: []const u8, start_pos: usize, marker_slice: []const u8, parent_node: *Node, text_segment_start_ptr: *usize, current_pos_ptr: *usize) !bool {
    const content_start = start_pos + marker_slice.len;
    var search_offset = content_start;
    while (search_offset < text.len) {
        const closes_here = text[search_offset] == marker_slice[0] and
            text.len >= search_offset + marker_slice.len and
            mem.eql(u8, text[search_offset .. search_offset + marker_slice.len], marker_slice);
        if (closes_here) {
            if (search_offset == content_start) {
                // Empty content (e.g. `****`): skip this marker and keep looking.
                search_offset += marker_slice.len;
                continue;
            }
            var emph_node = Node.init(
                self.allocator,
                if (marker_slice.len == 2) (if (marker_slice[0] == '~') .Strikethrough else .Strong) else .Emphasis,
            );
            errdefer emph_node.deinit();
            try self.parseInlines(text[content_start..search_offset], &emph_node, .{});
            if (emph_node.children.items.len == 0) {
                // Nothing inside: not a valid span, markers stay literal.
                emph_node.deinit();
                return false;
            }
            // Only now that the span is known to be valid, emit the text that
            // precedes the opening marker.
            const pending_start = text_segment_start_ptr.*;
            if (pending_start < start_pos) {
                var text_node = Node.init(self.allocator, .Text);
                text_node.content = try self.allocator.dupe(u8, text[pending_start..start_pos]);
                errdefer text_node.deinit();
                try parent_node.appendChild(text_node);
            }
            try parent_node.appendChild(emph_node);
            const after_span = search_offset + marker_slice.len;
            text_segment_start_ptr.* = after_span;
            current_pos_ptr.* = after_span;
            return true;
        }
        // A backslash hides the following byte from marker matching.
        if (text[search_offset] == '\\' and search_offset + 1 < text.len) {
            search_offset += 2;
        } else {
            search_offset += 1;
        }
    }
    return false; // No matching end marker found
}
// --- Block helper predicates ---
// Reports whether `line_content` (indentation already removed) looks like a
// thematic break: at least three occurrences of one marker character
// (`*`, `-` or `_`), optionally mixed with spaces and tabs, and nothing else
// before the end of the line. Scanning stops at the first `\n` or `\r`.
// This is the quick predicate; the full parse lives in tryParseThematicBreak.
fn isThematicBreakStart(line_content: []const u8) bool {
    var marker_char: u8 = 0; // 0 = no marker seen yet
    var count: usize = 0;
    for (line_content) |char| {
        switch (char) {
            '*', '-', '_' => {
                if (marker_char == 0) {
                    marker_char = char;
                } else if (char != marker_char) {
                    return false; // mixed markers, e.g. `**-`
                }
                count += 1;
            },
            ' ', '\t' => {},
            '\n', '\r' => break,
            else => return false,
        }
    }
    return count >= 3;
}
// Reports whether `line_content` (indentation already removed) opens an ATX
// heading: one to six `#` characters followed by a space, a tab, a line ending,
// or the end of the slice. Lines such as `#hashtag` or `#######` are ordinary
// text and must not interrupt a paragraph.
fn isAtxHeadingStart(line_content: []const u8) bool {
    var hashes: usize = 0;
    while (hashes < line_content.len and line_content[hashes] == '#') : (hashes += 1) {}
    if (hashes == 0 or hashes > 6) return false;
    if (hashes == line_content.len) return true; // bare `#`..`######`
    return switch (line_content[hashes]) {
        ' ', '\t', '\n', '\r' => true,
        else => false,
    };
}
// Reports whether `line_content` begins with a code fence: three identical
// characters, either backticks or tildes. Only the first three bytes are
// inspected.
fn isFencedCodeBlockStart(line_content: []const u8) bool {
    if (line_content.len < 3) return false;
    return switch (line_content[0]) {
        '`', '~' => |fence_char| line_content[1] == fence_char and line_content[2] == fence_char,
        else => false,
    };
}
// Classification of a line by the list marker it starts with.
const ListItemType = enum {
    // The line does not begin with a list marker.
    NotListItem,
    // Bullet marker: `*`, `-` or `+`.
    Unordered,
    // Numbered marker: digits followed by `.` or `)`.
    Ordered,
};
// Quick check for a list marker at the start of `line_content`.
//
// Leading spaces are skipped first. Returns:
//   .Unordered   - `*`, `-` or `+` followed by a space, tab or `\n`
//   .Ordered     - 1 to 9 digits, then `.` or `)`, then a space, tab or `\n`
//   .NotListItem - anything else (including a marker that is the last byte of
//                  the slice, since no separator follows it)
fn isListItemStart(line_content: []const u8) ListItemType {
    var i: usize = 0;
    // Skip leading spaces on the content line (already stripped by indent)
    while (i < line_content.len and line_content[i] == ' ') : (i += 1) {}
    if (i >= line_content.len) return .NotListItem;
    // Unordered
    const first = line_content[i];
    if (first == '*' or first == '-' or first == '+') {
        if (i + 1 < line_content.len) {
            const after = line_content[i + 1];
            if (after == ' ' or after == '\t' or after == '\n') return .Unordered;
        }
        // Not a bullet; a bullet character is not a digit, so the ordered
        // check below rejects the line as well.
    }
    // Ordered
    const num_start = i;
    while (i < line_content.len and ascii.isDigit(line_content[i])) : (i += 1) {}
    const digit_count = i - num_start;
    // CommonMark: ordered list marker max 9 digits
    if (digit_count == 0 or digit_count > 9) return .NotListItem;
    if (i >= line_content.len) return .NotListItem;
    if (line_content[i] != '.' and line_content[i] != ')') return .NotListItem;
    if (i + 1 >= line_content.len) return .NotListItem;
    const separator = line_content[i + 1];
    if (separator == ' ' or separator == '\t' or separator == '\n') return .Ordered;
    return .NotListItem;
}
// Reports whether the first byte of `line_content` is the blockquote marker `>`.
// An empty slice is never a blockquote start.
fn isBlockquoteStart(line_content: []const u8) bool {
    return line_content.len > 0 and line_content[0] == '>';
}
// Returns the heading level announced by a Setext underline, or 0 when `line`
// is not one.
//
//   1 - a run of `=`
//   2 - a run of `-`
//
// The line must be indented by less than TAB_STOP columns. After the indent it
// must consist of one unbroken run of the marker character, optionally followed
// by spaces/tabs up to the line ending. Whitespace *inside* the run (`- - -`,
// `--- -`) disqualifies the line, so such lines remain available as thematic
// breaks instead of turning the preceding text into a heading.
fn getSetextUnderlineLevel(line: []const u8) u8 {
    const indent = getIndentWidth(line, TAB_STOP);
    if (indent >= TAB_STOP) return 0;
    const content = stripIndent(line, indent, TAB_STOP);
    if (content.len == 0) return 0;
    const marker = content[0];
    if (marker != '=' and marker != '-') return 0;
    // Consume the contiguous marker run (at least content[0]).
    var i: usize = 0;
    while (i < content.len and content[i] == marker) : (i += 1) {}
    // Only trailing whitespace may follow, up to the line ending.
    while (i < content.len) : (i += 1) {
        switch (content[i]) {
            ' ', '\t' => {},
            '\n', '\r' => break,
            else => return 0,
        }
    }
    return if (marker == '=') 1 else 2;
}
// --- Complex Block Parsers (Blockquote, Lists) ---
// These often involve recursive parsing of their contents.
// Tries to parse a blockquote starting at the current line and, on success,
// appends a `.Blockquote` node to `parent_node`.
//
// Returns `false` (with the line cursor unchanged) when the current line is
// missing, indented by TAB_STOP or more, or does not start with `>`.
// Returns `true` when lines were consumed.
//
// Line collection:
//   - Lines starting with `>` contribute their text after the marker and one
//     optional space/tab.
//   - Other lines are taken as lazy continuations (with their original indent)
//     unless they are blank or look like a thematic break, ATX heading, code
//     fence, Setext underline or list item; any of those ends the quote.
// The collected lines are joined into one buffer and parsed by a nested
// `Parser`; the children of its document node are moved into the quote node.
// Known link references are copied into the nested parser first.
//
// If MAX_RECURSION_DEPTH is reached the lines are still consumed but no node is
// produced.
// Errors: propagates allocation and nested-parse failures.
fn tryParseBlockquote(self: *Parser, parent_node: *Node) !bool {
    const initial_line_idx = self.current_line_idx;
    const line = self.currentLine() orelse return false;
    const indent = getIndentWidth(line, TAB_STOP);
    if (indent >= TAB_STOP) return false;
    const content_line = stripIndent(line, indent, TAB_STOP);
    if (!isBlockquoteStart(content_line)) return false;
    var quote_node = Node.init(self.allocator, .Blockquote);
    errdefer quote_node.deinit();
    var lines_for_blockquote_content = ArrayList([]const u8).init(self.allocator);
    defer {
        for (lines_for_blockquote_content.items) |l| self.allocator.free(l);
        lines_for_blockquote_content.deinit();
    }
    while (self.currentLine()) |current_bq_line| {
        const current_bq_line_indent = getIndentWidth(current_bq_line, TAB_STOP);
        var current_bq_content_line = stripIndent(current_bq_line, current_bq_line_indent, TAB_STOP);
        if (isBlockquoteStart(current_bq_content_line)) {
            // Consume '>' and optional space
            current_bq_content_line = current_bq_content_line[1..];
            if (current_bq_content_line.len > 0 and (current_bq_content_line[0] == ' ' or current_bq_content_line[0] == '\t')) {
                current_bq_content_line = current_bq_content_line[1..];
            }
            try lines_for_blockquote_content.append(try self.allocator.dupe(u8, current_bq_content_line));
            self.advanceLine();
            continue;
        }
        // No marker: decide whether this is a lazy continuation line.
        // A blank line ends the blockquote (simplified).
        if (isBlankLine(current_bq_line)) break;
        // Lines that start another block terminate the quote.
        if (isThematicBreakStart(current_bq_content_line) or
            isAtxHeadingStart(current_bq_content_line) or
            isFencedCodeBlockStart(current_bq_content_line) or
            getSetextUnderlineLevel(current_bq_content_line) > 0)
        {
            break;
        }
        if (isListItemStart(current_bq_content_line) != .NotListItem) break;
        // Lazy continuation: keep the original indent for sub-parsing.
        try lines_for_blockquote_content.append(try self.allocator.dupe(u8, current_bq_line));
        self.advanceLine();
    }
    if (lines_for_blockquote_content.items.len == 0) {
        quote_node.deinit();
        self.current_line_idx = initial_line_idx;
        return false;
    }
    if (self.recursion_depth >= MAX_RECURSION_DEPTH) {
        // TODO: How to handle this error? Maybe just parse as text.
        // For now the lines are consumed and nothing is appended.
        dbgPrint("Max recursion depth reached in blockquote.\n", .{});
        // This is a normal return, so the errdefer above will not run:
        // release the unused node explicitly.
        quote_node.deinit();
        return true;
    }
    self.recursion_depth += 1;
    defer self.recursion_depth -= 1;
    // Rebuild the quoted text for the nested parser, one line per entry.
    var sub_parser_text_buf = ArrayList(u8).init(self.allocator);
    defer sub_parser_text_buf.deinit();
    for (lines_for_blockquote_content.items) |l| {
        try sub_parser_text_buf.appendSlice(l);
        // Keep lines separate even if a collected line carries no newline.
        if (l.len == 0 or l[l.len - 1] != '\n') {
            try sub_parser_text_buf.append('\n');
        }
    }
    if (sub_parser_text_buf.items.len > 0) {
        // NOTE(review): ownership of the slice passes to `Parser.init` here;
        // confirm that `Parser.deinit` releases it.
        var sub_parser = try Parser.init(self.allocator, try sub_parser_text_buf.toOwnedSlice());
        defer sub_parser.deinit();
        // Copy link references
        var ref_iter = self.link_references.iterator();
        while (ref_iter.next()) |entry| {
            try sub_parser.link_references.put(
                try self.allocator.dupe(u8, entry.key_ptr.*),
                LinkRef{
                    .url = try self.allocator.dupe(u8, entry.value_ptr.url),
                    .title = if (entry.value_ptr.title) |t| try self.allocator.dupe(u8, t) else null,
                },
            );
        }
        var sub_document_node = try sub_parser.parseDocument();
        // Move the children from the nested document into the quote node.
        for (sub_document_node.children.items) |child_node| {
            try quote_node.appendChild(child_node);
        }
        // The children now belong to quote_node. Empty the list (keeping it in a
        // defined state) so that deinit below releases only the list storage.
        sub_document_node.children.clearRetainingCapacity();
        sub_document_node.deinit();
    }
    try parent_node.appendChild(quote_node);
    return true;
}
// tryParseUnorderedList and tryParseOrderedList are very complex due to nesting,
// lazy continuation, and determining "tight" vs "loose".
// This is a simplified version.
// Bullet-list entry point: delegates to `tryParseList` with `.Unordered` and
// returns its result unchanged.
fn tryParseUnorderedList(self: *Parser, parent_node: *Node, current_block_indent: usize) !bool {
    const list_was_parsed = try self.tryParseList(.Unordered, parent_node, current_block_indent);
    return list_was_parsed;
}
// Numbered-list entry point: delegates to `tryParseList` with `.Ordered` and
// returns its result unchanged.
fn tryParseOrderedList(self: *Parser, parent_node: *Node, current_block_indent: usize) !bool {
    const list_was_parsed = try self.tryParseList(.Ordered, parent_node, current_block_indent);
    return list_was_parsed;
}
// Tries to parse a list of kind `list_type_check` (`.Unordered` or `.Ordered`)
// starting at the current line and, on success, appends an `.UnorderedList` /
// `.OrderedList` node to `parent_node`.
//
// Returns `false` when the current line is missing, is not an item of the
// requested kind, or is indented less than `current_block_indent`.
//
// Simplified model:
//   - Each item is the marker line plus following continuation lines that are
//     indented at least `item_indent` and at least `current_block_indent + 2`.
//   - Item content is joined with spaces and parsed as inline text only (no
//     nested block parsing).
//   - A single blank line directly followed by another item of the same kind
//     keeps the list going and makes it loose. A blank line followed by
//     sufficiently indented content stays inside the item and also makes the
//     list loose. A blank line that merely precedes the end of the list (or of
//     the input) does not affect tightness.
//   - Any other non-item line ends the list.
// For ordered lists, `start_number` is taken from the first item.
// Errors: propagates allocation / inline-parse failures.
fn tryParseList(self: *Parser, list_type_check: ListItemType, parent_node: *Node, current_block_indent: usize) !bool {
    const initial_line_idx = self.current_line_idx;
    const line = self.currentLine() orelse return false;
    // Indent of the most recently accepted item; items may only get deeper.
    var item_indent = getIndentWidth(line, TAB_STOP);
    const content_after_indent = stripIndent(line, item_indent, TAB_STOP);
    const current_item_type_info = self.getListItemInfo(content_after_indent);
    if (current_item_type_info.item_type != list_type_check) return false;
    if (item_indent < current_block_indent) return false; // Must be at least same indent level
    var list_node = Node.init(self.allocator, if (list_type_check == .Unordered) .UnorderedList else .OrderedList);
    errdefer list_node.deinit();
    if (list_type_check == .Ordered) {
        list_node.start_number = current_item_type_info.number;
    }
    var is_tight = true; // Assume tight initially
    // Loop for list items
    while (self.currentLine()) |item_line_loop| {
        const loop_item_indent = getIndentWidth(item_line_loop, TAB_STOP);
        const loop_content_after_indent = stripIndent(item_line_loop, loop_item_indent, TAB_STOP);
        const loop_item_info = self.getListItemInfo(loop_content_after_indent);
        if (loop_item_info.item_type != list_type_check) {
            // Not an item of this list. The only case that keeps the list open
            // is one blank line separating two sibling items: skip it and mark
            // the list loose.
            if (list_node.children.items.len > 0 and isBlankLine(item_line_loop)) {
                if (self.peekLine(1)) |peeked_line| {
                    const peek_indent = getIndentWidth(peeked_line, TAB_STOP);
                    const peek_content = stripIndent(peeked_line, peek_indent, TAB_STOP);
                    const peek_info = self.getListItemInfo(peek_content);
                    if (peek_info.item_type == list_type_check and peek_indent >= item_indent) {
                        is_tight = false;
                        self.advanceLine(); // consume the separating blank line
                        continue;
                    }
                }
            }
            break;
        }
        if (loop_item_indent < item_indent) break; // Dedent ends the list.
        // The current item dictates the base indent for its content.
        item_indent = loop_item_indent;
        // Parse this list item
        var list_item_node = Node.init(self.allocator, .ListItem);
        errdefer list_item_node.deinit();
        if (loop_item_info.is_task_list) {
            list_item_node.is_task_list_item = true;
            list_item_node.is_task_list_item_checked = loop_item_info.is_task_checked;
        }
        // Collect lines for this single list item
        var item_content_lines = ArrayList([]const u8).init(self.allocator);
        defer {
            for (item_content_lines.items) |l| self.allocator.free(l);
            item_content_lines.deinit();
        }
        // First line of item content (after marker)
        const first_content_part = loop_content_after_indent[loop_item_info.marker_len..];
        try item_content_lines.append(try self.allocator.dupe(u8, mem.trimLeft(u8, first_content_part, " \t")));
        self.advanceLine();
        // Subsequent lines for the item
        while (self.currentLine()) |continuation_line| {
            const cont_indent = getIndentWidth(continuation_line, TAB_STOP);
            if (isBlankLine(continuation_line)) {
                // Blank line followed by end of input: the item ends here and
                // the blank line says nothing about tightness.
                const peek_cont = self.peekLine(1) orelse break;
                const peek_cont_indent = getIndentWidth(peek_cont, TAB_STOP);
                const peek_cont_content = stripIndent(peek_cont, peek_cont_indent, TAB_STOP);
                const peek_cont_info = self.getListItemInfo(peek_cont_content);
                if (peek_cont_info.item_type == list_type_check and peek_cont_indent >= item_indent) {
                    // Blank line between this item and the next one.
                    is_tight = false;
                    break;
                }
                if (peek_cont_indent < item_indent + 2) { // Arbitrary continuation indent threshold
                    // What follows is outside the item; the blank line is just
                    // trailing space after the list.
                    break;
                }
                // Otherwise the blank line is part of this item's content.
                try item_content_lines.append(try self.allocator.dupe(u8, "\n")); // Represent blank line
                is_tight = false; // Blank line within an item's content makes list loose.
                self.advanceLine();
                continue;
            }
            // A new sibling item at the same indent ends the current item.
            const next_item_content = stripIndent(continuation_line, cont_indent, TAB_STOP);
            const next_item_info = self.getListItemInfo(next_item_content);
            if (next_item_info.item_type == list_type_check and cont_indent == item_indent) {
                break;
            }
            // Heuristic for continuation indent.
            if (cont_indent < item_indent or cont_indent < current_block_indent + 2) {
                break; // Not indented enough to be a continuation.
            }
            // Strip roughly "item indent + marker + one space" (approximation).
            const effective_strip_indent = @min(cont_indent, item_indent + loop_item_info.marker_len + 1);
            try item_content_lines.append(try self.allocator.dupe(u8, stripIndent(continuation_line, effective_strip_indent, TAB_STOP)));
            self.advanceLine();
        }
        // For simplicity, treat item content as a single run of inline text:
        // trimmed lines joined with single spaces.
        var item_full_content_buf = ArrayList(u8).init(self.allocator);
        defer item_full_content_buf.deinit();
        for (item_content_lines.items, 0..) |content_line, line_no| {
            try item_full_content_buf.appendSlice(trimLine(content_line));
            if (line_no < item_content_lines.items.len - 1) {
                try item_full_content_buf.append(' ');
            }
        }
        if (item_full_content_buf.items.len > 0) {
            try self.parseInlines(item_full_content_buf.items, &list_item_node, .{});
        }
        try list_node.appendChild(list_item_node);
    }
    if (list_node.children.items.len == 0) {
        list_node.deinit();
        self.current_line_idx = initial_line_idx;
        return false;
    }
    list_node.tight = is_tight;
    try parent_node.appendChild(list_node);
    return true;
}
// Result of `getListItemInfo`: describes the list marker (if any) found at the
// start of an indent-stripped line.
const ListItemInfo = struct {
// Kind of marker found; `.NotListItem` when the line is not a list item.
item_type: ListItemType,
// Number of bytes to skip to get past the marker. For a task-list item this
// also covers the checkbox through its closing `]`. 0 when not a list item.
marker_len: usize,
number: ?u64, // For ordered lists; null for unordered items
// True when a bullet is followed by a `[ ]`, `[x]` or `[X]` checkbox.
is_task_list: bool = false,
// True when that checkbox contains `x` or `X`.
is_task_checked: bool = false,
};
/// Classifies the beginning of `line_content` as a list item marker.
///
/// `line_content` is expected to have its leading indentation already removed,
/// so the marker must start at index 0.
///
/// Recognised forms:
///   * bullet markers `*`, `-`, `+` followed by a space, tab or newline;
///   * bullet markers followed by a task checkbox `[ ]`, `[x]` or `[X]`, which
///     itself must be followed by a space, tab, newline or the end of the line;
///   * ordered markers made of 1 to 9 digits, then `.` or `)`, then a space,
///     tab or newline.
///
/// Returns a `ListItemInfo` whose `item_type` is `.NotListItem` (with
/// `marker_len == 0`) when none of the forms match. `marker_len` never
/// includes the whitespace that follows the marker.
fn getListItemInfo(self: *Parser, line_content: []const u8) ListItemInfo {
    _ = self;
    const not_a_list_item: ListItemInfo = .{ .item_type = .NotListItem, .marker_len = 0, .number = null };
    if (line_content.len == 0) return not_a_list_item;

    // Unordered: *, -, +
    const first = line_content[0];
    if (first == '*' or first == '-' or first == '+') {
        // A bullet only counts when whitespace follows it; otherwise the line
        // falls through to the ordered check, which rejects it (no digits).
        if (line_content.len > 1 and
            (line_content[1] == ' ' or line_content[1] == '\t' or line_content[1] == '\n'))
        {
            // Check for task list item: `* [ ] ` or `* [x] `
            var box_start: usize = 2; // Skip the bullet and the whitespace after it
            while (box_start < line_content.len and
                (line_content[box_start] == ' ' or line_content[box_start] == '\t')) : (box_start += 1)
            {}

            if (box_start + 2 < line_content.len and line_content[box_start] == '[') {
                const cb_content = line_content[box_start + 1];
                const is_box = (cb_content == ' ' or cb_content == 'x' or cb_content == 'X') and
                    line_content[box_start + 2] == ']';
                if (is_box) {
                    // The checkbox must be followed by whitespace or the end of the line.
                    const after_box = box_start + 3;
                    const terminated = after_box == line_content.len or
                        line_content[after_box] == ' ' or
                        line_content[after_box] == '\t' or
                        line_content[after_box] == '\n';
                    if (terminated) {
                        return .{
                            .item_type = .Unordered,
                            .marker_len = after_box, // Covers the bullet through the closing `]`
                            .number = null,
                            .is_task_list = true,
                            .is_task_checked = (cb_content == 'x' or cb_content == 'X'),
                        };
                    }
                }
            }
            return .{ .item_type = .Unordered, .marker_len = 1, .number = null };
        }
    }

    // Ordered: 1. 1)
    var i: usize = 0;
    while (i < line_content.len and ascii.isDigit(line_content[i])) : (i += 1) {}
    const digit_count = i;
    // Enforce the 9-digit limit before parsing so the number always fits in a u64
    // and an over-long digit run is rejected quietly instead of overflowing.
    if (digit_count == 0 or digit_count > 9) return not_a_list_item;
    if (i >= line_content.len or (line_content[i] != '.' and line_content[i] != ')')) return not_a_list_item;

    const number = std.fmt.parseUnsigned(u64, line_content[0..digit_count], 10) catch return not_a_list_item;
    i += 1; // consume . or )
    if (i < line_content.len and
        (line_content[i] == ' ' or line_content[i] == '\t' or line_content[i] == '\n'))
    {
        return .{ .item_type = .Ordered, .marker_len = i, .number = number };
    }
    return not_a_list_item;
}
/// Tries to parse an HTML block starting at the current line.
///
/// This is a very simplified HTML block parser (CommonMark defines 7 kinds of
/// HTML block). A block starts when the current line, indented by less than
/// `TAB_STOP` columns, begins with `<` followed by one of the known block-level
/// tag names (matched case-insensitively). Lines are then collected verbatim
/// until:
///   * a blank line after the opening line (the blank line is included), or
///   * for `pre`, `script`, `style` and `textarea`, a line containing the
///     matching closing tag (case-insensitive; may be the opening line), or
///   * the end of the input.
///
/// Returns `true` after appending an `.HtmlBlock` node to `parent_node` and
/// advancing `self.current_line_idx` past the consumed lines. Returns `false`
/// without consuming anything when the current line does not open an HTML block.
/// Allocation failures are propagated as errors.
fn tryParseHtmlBlock(self: *Parser, parent_node: *Node) !bool {
    const initial_line_idx = self.current_line_idx;
    const line = self.currentLine() orelse return false;
    const indent = getIndentWidth(line, TAB_STOP);
    if (indent >= TAB_STOP) return false;
    const content = mem.trimLeft(u8, stripIndent(line, indent, TAB_STOP), " \t");
    if (content.len == 0 or content[0] != '<') return false;

    // Tags whose block is also ended by their own closing tag.
    const literal_tags = [_][]const u8{ "pre", "script", "style", "textarea" };
    // Common block tags that CommonMark treats as HTML block type 1 or 6.
    const block_tags = [_][]const u8{
        "pre",      "script",  "style",      "textarea", // Type 1
        "address",  "article", "aside",      "base",
        "basefont", "blockquote", "body",    "caption",
        "center",   "col",     "colgroup",   "dd",
        "details",  "dialog",  "dir",        "div",
        "dl",       "dt",      "fieldset",   "figcaption",
        "figure",   "footer",  "form",       "frame",
        "frameset", "h1",      "h2",         "h3",
        "h4",       "h5",      "h6",         "head",
        "header",   "hr",      "html",       "iframe",
        "legend",   "li",      "link",       "main",
        "menu",     "menuitem", "nav",       "noframes",
        "ol",       "optgroup", "option",    "p",
        "param",    "section", "source",     "summary",
        "table",    "tbody",   "td",         "tfoot",
        "th",       "thead",   "title",      "tr",
        "track",    "ul",
    };

    var tag_name_end: usize = 1;
    while (tag_name_end < content.len and ascii.isAlphanumeric(content[tag_name_end])) : (tag_name_end += 1) {}
    if (tag_name_end == 1) return false; // No tag name, e.g. `<>` or `</`
    const tag_name = content[1..tag_name_end];

    var is_known_block_tag = false;
    for (block_tags) |bt| {
        if (ascii.eqlIgnoreCase(bt, tag_name)) {
            is_known_block_tag = true;
            break;
        }
    }
    if (!is_known_block_tag) return false; // Not a recognized block-level HTML tag start

    var is_literal_tag = false;
    for (literal_tags) |lt| {
        if (ascii.eqlIgnoreCase(lt, tag_name)) {
            is_literal_tag = true;
            break;
        }
    }

    // Build the closing tag once instead of on every line of the block.
    var closing_tag_buf = ArrayList(u8).init(self.allocator);
    defer closing_tag_buf.deinit();
    if (is_literal_tag) {
        try closing_tag_buf.appendSlice("</");
        try closing_tag_buf.appendSlice(tag_name);
        try closing_tag_buf.append('>');
    }

    var html_content_buf = ArrayList(u8).init(self.allocator);
    defer html_content_buf.deinit();

    var current_html_line_idx = initial_line_idx;
    while (current_html_line_idx < self.input_lines.items.len) {
        const current_html_line = self.input_lines.items[current_html_line_idx];
        const is_opening_line = current_html_line_idx == initial_line_idx;
        try html_content_buf.appendSlice(current_html_line);
        current_html_line_idx += 1; // Every collected line is consumed, including the terminating one

        // A blank line ends the block, but only after the opening line.
        if (!is_opening_line and isBlankLine(current_html_line)) break;
        // For <script>, <style>, <pre>, <textarea>: the closing tag ends the block.
        if (is_literal_tag and ascii.indexOfIgnoreCase(current_html_line, closing_tag_buf.items) != null) break;
    }
    if (html_content_buf.items.len == 0) return false;

    var html_node = Node.init(self.allocator, .HtmlBlock);
    // NOTE(review): this relies on Node.deinit releasing `content` — confirm.
    // A second errdefer freeing `content` directly would then free it twice
    // when appendChild fails, so only this one is kept.
    errdefer html_node.deinit();
    html_node.content = try html_content_buf.toOwnedSlice();
    try parent_node.appendChild(html_node);

    self.current_line_idx = current_html_line_idx; // Advance main parser state
    return true;
}
}; // End Parser struct
/// Reports whether `char` is an ASCII punctuation character, i.e. one of
/// !"#$%&'()*+,-./  :;<=>?@  [\]^_`  {|}~
fn isMarkdownPunctuation(char: u8) bool {
    // The four ranges below are exactly the contiguous ASCII punctuation runs.
    switch (char) {
        '!'...'/' => return true,
        ':'...'@' => return true,
        '['...'`' => return true,
        '{'...'~' => return true,
        else => return false,
    }
}
/// Parses `markdown_text` and returns the root node of the resulting tree.
///
/// A temporary `Parser` is created with `allocator` and deinitialized before
/// this function returns. Callers in this file release the returned node with
/// `deinit()`. Errors from `Parser.init` and `parseDocument` are propagated.
pub fn parseMarkdown(allocator: Allocator, markdown_text: []const u8) !Node {
    var md_parser = try Parser.init(allocator, markdown_text);
    defer md_parser.deinit();

    const document = md_parser.parseDocument();
    return document;
}
/// Demo entry point: parses a built-in Markdown sample and prints both the
/// input and the resulting AST to stderr via `std.debug.print`.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    // A multiline string literal yields the sample text directly; no comptime
    // buffer or stream is needed, and no reference to a comptime variable
    // escapes to runtime.
    const markdown_input =
        \\# Heading 1
        \\
        \\This is a paragraph with *italic* and **bold** text.
        \\And `inline code`. Here's a [link](http://example.com "Example").
        \\And an image ![alt text](/path/to/image.png "Image Title").
        \\
        \\> Blockquote line 1
        \\> Blockquote line 2
        \\> > Nested blockquote
        \\> Back to first level.
        \\
        \\```javascript
        \\function hello() {
        \\ console.log("Hello, Fenced Code!");
        \\}
        \\```
        \\
        \\ function indentedCode() {
        \\ return true;
        \\ }
        \\
        \\---
        \\
        \\* Unordered item 1
        \\* Unordered item 2
        \\ * Nested item 2.1 (Tight list handling is simplified)
        \\ * Deeper Nested 2.1.1
        \\* Unordered item 3
        \\
        \\1. Ordered item 1
        \\2. Ordered item 2
        \\ With a continuation.
        \\300. Ordered item 3 (starts at 300)
        \\
        \\A paragraph that will become
        \\=============================
        \\A Setext L1 Heading.
        \\
        \\Another para for L2
        \\---------------------
        \\
        \\Strikethrough ~~deleted text~~.
        \\
        \\[refdef]: /url/ "My Ref Title"
        \\This is a ref [link][refdef]. And [refdef][]. And [implicit refdef].
        \\
        \\<p>This is an HTML block.</p>
        \\<div>
        \\ Another HTML block line.
        \\</div>
        \\
        \\* [ ] Unchecked task
        \\* [x] Checked task
        \\
    ;

    std.debug.print("---MARKDOWN INPUT---\n{s}\n---END INPUT---\n\n", .{markdown_input});

    const ast_root = try parseMarkdown(allocator, markdown_input);
    defer ast_root.deinit();

    std.debug.print("---AST OUTPUT---\n", .{});
    ast_root.print(0);
    std.debug.print("---END AST OUTPUT---\n", .{});
}
// Minimal test runner, can be expanded.
// Checks that an ATX heading followed by a paragraph yields a Document with
// exactly those two children, each holding a single text node.
test "basic heading and paragraph" {
    const test_allocator = std.testing.allocator;
    const md = "# Test\n\nA paragraph.";
    var root = try parseMarkdown(test_allocator, md);
    defer root.deinit();

    // std.testing helpers take (expected, actual); expected counts are typed
    // explicitly so the comparison does not depend on how the compiler
    // resolves a bare integer literal against a runtime usize.
    try std.testing.expectEqual(NodeType.Document, root.type);
    try std.testing.expectEqual(@as(usize, 2), root.children.items.len);

    const heading = root.children.items[0];
    try std.testing.expectEqual(NodeType.Heading, heading.type);
    try std.testing.expect(heading.level.? == 1);
    try std.testing.expectEqual(@as(usize, 1), heading.children.items.len); // Text node "Test"
    try std.testing.expectEqualStrings("Test", heading.children.items[0].content.?);

    const para = root.children.items[1];
    try std.testing.expectEqual(NodeType.Paragraph, para.type);
    try std.testing.expectEqual(@as(usize, 1), para.children.items.len); // Text node "A paragraph."
    try std.testing.expectEqualStrings("A paragraph.", para.children.items[0].content.?);
}
// Checks that `**bold** *italic*` parses into a paragraph containing a Strong
// node, a single-space Text node and an Emphasis node, in that order.
test "inline strong and emphasis" {
    const test_allocator = std.testing.allocator;
    const md = "**bold** *italic*";
    var root = try parseMarkdown(test_allocator, md);
    defer root.deinit();

    try std.testing.expectEqual(NodeType.Document, root.type);
    try std.testing.expectEqual(@as(usize, 1), root.children.items.len); // Paragraph

    const para = root.children.items[0];
    try std.testing.expectEqual(@as(usize, 3), para.children.items.len); // Strong, Text(" "), Emphasis

    // Text is compared with expectEqualStrings: expectEqual on slices compares
    // pointer and length, so it can never match parsed content against a literal.
    const strong_node = para.children.items[0];
    try std.testing.expectEqual(NodeType.Strong, strong_node.type);
    try std.testing.expectEqualStrings("bold", strong_node.children.items[0].content.?);

    const space_node = para.children.items[1];
    try std.testing.expectEqual(NodeType.Text, space_node.type);
    try std.testing.expectEqualStrings(" ", space_node.content.?);

    const em_node = para.children.items[2];
    try std.testing.expectEqual(NodeType.Emphasis, em_node.type);
    try std.testing.expectEqualStrings("italic", em_node.children.items[0].content.?);
}
// Checks that a fenced code block keeps its info string and its content
// (including the trailing newline of the last code line).
test "fenced code block" {
    const test_allocator = std.testing.allocator;
    const md = "```rust\nlet x = 10;\n```";
    var root = try parseMarkdown(test_allocator, md);
    defer root.deinit();

    // std.testing helpers take (expected, actual).
    try std.testing.expectEqual(@as(usize, 1), root.children.items.len);

    const fcb = root.children.items[0];
    try std.testing.expectEqual(NodeType.FencedCodeBlock, fcb.type);
    try std.testing.expectEqualStrings("rust", fcb.info_string.?);
    try std.testing.expectEqualStrings("let x = 10;\n", fcb.content.?);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment