-
-
Save jdmichaud/794fb2fdfcad4a2306da4cdee51d7b8c to your computer and use it in GitHub Desktop.
Markdown parser
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include <stdbool.h> | |
#include <ctype.h> | |
#define MAX_LINE_LENGTH 4096 // Max line length for reading input (can be adjusted) | |
#define INITIAL_CHILDREN_CAPACITY 4 | |
#define TAB_STOP_WIDTH 4 | |
// --- Enums and Structs --- | |
// Kinds of node that can appear in the Markdown AST.
// Enumerator order is significant: values are assigned implicitly from 0.
typedef enum {
    /* ---- Block-level nodes ---- */
    MD_NODE_DOCUMENT,
    MD_NODE_PARAGRAPH,
    MD_NODE_HEADING,
    MD_NODE_BLOCKQUOTE,
    MD_NODE_UNORDERED_LIST,
    MD_NODE_ORDERED_LIST,
    MD_NODE_LIST_ITEM,
    MD_NODE_CODE_BLOCK,         /* indented code block */
    MD_NODE_FENCED_CODE_BLOCK,
    MD_NODE_HORIZONTAL_RULE,
    MD_NODE_HTML_BLOCK,         /* basic support */

    /* ---- Inline nodes ---- */
    MD_NODE_TEXT,
    MD_NODE_EMPHASIS,
    MD_NODE_STRONG,
    MD_NODE_STRIKETHROUGH,
    MD_NODE_INLINE_CODE,
    MD_NODE_LINK,
    MD_NODE_IMAGE,
    MD_NODE_SOFT_BREAK,
    MD_NODE_HARD_BREAK,
    MD_NODE_HTML_INLINE,        /* basic support */
    MD_NODE_ESCAPED_CHAR
} MDNodeType;
typedef struct MDNode { | |
MDNodeType type; | |
char* text_content; // For TEXT, INLINE_CODE, HTML_INLINE/BLOCK, ESCAPED_CHAR | |
char* code_language; // For FENCED_CODE_BLOCK | |
char* code_content; // For CODE_BLOCK, FENCED_CODE_BLOCK | |
int heading_level; // For HEADING (1-6) | |
char* url; // For LINK, IMAGE | |
char* title; // For LINK, IMAGE (optional) | |
char* alt_text; // For IMAGE | |
char list_item_marker; // For LIST_ITEM ('*', '-', '+', '.', ')') | |
int list_start_number; // For ORDERED_LIST (if applicable for the list itself) | |
bool tight_list; // For lists (true if no blank lines between items) | |
struct MDNode* parent; // Optional: points to parent node | |
struct MDNode** children; | |
size_t children_count; | |
size_t children_capacity; | |
} MDNode; | |
// --- Utility Functions --- | |
// Returns a newly malloc'd copy of `s`, or NULL when `s` is NULL or the
// allocation fails. The caller owns the result and must free() it.
char* md_strdup(const char* s) {
    if (s == NULL) {
        return NULL;
    }
    const size_t bytes_with_nul = strlen(s) + 1;
    char* copy = (char*)malloc(bytes_with_nul);
    if (copy != NULL) {
        memcpy(copy, s, bytes_with_nul);
    }
    return copy;
}
// Returns a newly malloc'd, NUL-terminated copy of at most `n` characters of `s`.
// Copying stops early at a terminator inside the first `n` bytes, so `s` is
// never read past its end even when `n` exceeds its length.
// Returns NULL when `s` is NULL or the allocation fails; the caller frees the result.
char* md_strndup(const char* s, size_t n) {
    if (!s) return NULL;

    // Bounded length scan: min(strlen(s), n) without touching bytes beyond the NUL.
    size_t len = 0;
    while (len < n && s[len] != '\0') {
        len++;
    }

    char* d = (char*)malloc(len + 1);
    if (!d) return NULL;
    memcpy(d, s, len);
    d[len] = '\0';
    return d;
}
// Returns a pointer to the first non-whitespace character of `str`
// (or to its terminator when the string is entirely whitespace).
// Nothing is copied or modified; the result points into `str`.
const char* trim_leading_whitespace(const char* str) {
    for (; *str != '\0'; ++str) {
        if (!isspace((unsigned char)*str)) {
            break;
        }
    }
    return str;
}
// Removes trailing whitespace from `str` in place by writing a new terminator,
// and returns `str`. NULL and empty strings are returned untouched (no write),
// so an empty read-only string is safe to pass.
char* trim_trailing_whitespace(char* str) {
    if (!str || !*str) return str;

    // Index-based scan: never forms a pointer before the start of the buffer,
    // even when the whole string is whitespace.
    size_t len = strlen(str);
    while (len > 0 && isspace((unsigned char)str[len - 1])) {
        len--;
    }
    str[len] = '\0';
    return str;
}
// Counts the ' ' characters at the start of `line`.
// Only the space character is counted; tabs and other whitespace stop the count.
int count_leading_spaces(const char* line) {
    const char* cursor = line;
    while (*cursor == ' ') {
        ++cursor;
    }
    return (int)(cursor - line);
}
// True when `line` is empty or consists solely of whitespace characters.
bool is_blank_line(const char* line) {
    for (const char* c = line; *c != '\0'; ++c) {
        if (!isspace((unsigned char)*c)) {
            return false;
        }
    }
    return true;
}
char* expand_tabs(const char* line) { | |
int len = strlen(line); | |
// Estimate new length: each tab could become TAB_STOP_WIDTH spaces | |
char* new_line = (char*)malloc(len * TAB_STOP_WIDTH + 1); | |
if (!new_line) return NULL; | |
int current_col = 0; | |
int j = 0; // index for new_line | |
for (int i = 0; line[i]; ++i) { | |
if (line[i] == '\t') { | |
int spaces_to_add = TAB_STOP_WIDTH - (current_col % TAB_STOP_WIDTH); | |
for (int k = 0; k < spaces_to_add; ++k) { | |
new_line[j++] = ' '; | |
} | |
current_col += spaces_to_add; | |
} else { | |
new_line[j++] = line[i]; | |
current_col++; | |
} | |
} | |
new_line[j] = '\0'; | |
char* final_line = (char*)realloc(new_line, j + 1); // Resize to actual | |
return final_line ? final_line : new_line; // Return original if realloc fails | |
} | |
// --- AST Node Management --- | |
// Allocates a zero-initialised node of the given type.
// Never returns NULL: on allocation failure it reports the error with
// perror and terminates the process (simplest policy for a single-file tool).
MDNode* md_node_new(MDNodeType type) {
    MDNode* fresh = (MDNode*)calloc(1, sizeof(MDNode));
    if (fresh == NULL) {
        perror("Failed to allocate MDNode");
        exit(EXIT_FAILURE);
    }
    fresh->type = type;
    return fresh;
}
// Appends `child` to `parent`'s child array and sets child->parent.
// The array starts at INITIAL_CHILDREN_CAPACITY slots and doubles when full.
// Does nothing when either argument is NULL. If growing the array fails the
// error is reported with perror and the child is NOT attached (the caller
// still owns it).
void md_node_add_child(MDNode* parent, MDNode* child) {
    if (parent == NULL || child == NULL) {
        return;
    }

    const bool array_is_full = parent->children_count >= parent->children_capacity;
    if (array_is_full) {
        size_t grown_capacity = INITIAL_CHILDREN_CAPACITY;
        if (parent->children_capacity != 0) {
            grown_capacity = parent->children_capacity * 2;
        }

        MDNode** resized = (MDNode**)realloc(parent->children, grown_capacity * sizeof(MDNode*));
        if (resized == NULL) {
            perror("Failed to reallocate children array");
            return;
        }
        parent->children_capacity = grown_capacity;
        parent->children = resized;
    }

    child->parent = parent;
    parent->children[parent->children_count] = child;
    parent->children_count += 1;
}
// Recursively releases `node`, every string it owns, and its whole subtree.
// Passing NULL is a no-op.
void md_node_free(MDNode* node) {
    if (node == NULL) {
        return;
    }

    // Release the subtree first, then the child array itself.
    for (size_t remaining = node->children_count; remaining > 0; --remaining) {
        md_node_free(node->children[remaining - 1]);
    }
    free(node->children);

    // Owned strings (free(NULL) is harmless for the unused ones).
    free(node->alt_text);
    free(node->title);
    free(node->url);
    free(node->code_content);
    free(node->code_language);
    free(node->text_content);

    free(node);
}
// --- Line Buffer for Block Parsing --- | |
// Growable array of heap-allocated line copies used while collecting the
// lines that belong to one block.
typedef struct {
    char** lines;   /* array of owned strings */
    int count;      /* number of slots in use */
    int capacity;   /* number of slots allocated */
} LineBuffer;
// Puts `buf` into the empty state: no storage, zero lines, zero capacity.
// Does not free anything that `buf` may have held before.
void init_line_buffer(LineBuffer* buf) {
    *buf = (LineBuffer){ .lines = NULL, .count = 0, .capacity = 0 };
}
// Appends a private copy of `line` to `buf`, growing the slot array
// (16 slots initially, doubling afterwards) when it is full.
// On any failure (NULL `line`, or out of memory for the array or the copy)
// the line is dropped and `buf` is left exactly as it was, so every stored
// slot is always a valid non-NULL string.
void add_line_to_buffer(LineBuffer* buf, const char* line) {
    if (!line) return;

    if (buf->count >= buf->capacity) {
        // Commit the new capacity only after realloc succeeded; otherwise a
        // later call would believe the slots exist and write out of bounds.
        int new_capacity = buf->capacity == 0 ? 16 : buf->capacity * 2;
        char** new_lines_arr = (char**)realloc(buf->lines, (size_t)new_capacity * sizeof(char*));
        if (!new_lines_arr) {
            perror("Failed to realloc line buffer");
            return; // Data loss, but trying to continue
        }
        buf->lines = new_lines_arr;
        buf->capacity = new_capacity;
    }

    char* copy = md_strdup(line);
    if (!copy) {
        perror("Failed to duplicate line");
        return; // Do not store a NULL slot that consumers would dereference
    }
    buf->lines[buf->count++] = copy;
}
// Frees every stored line and the slot array, then resets `buf` to the
// empty state so it can be reused.
void free_line_buffer(LineBuffer* buf) {
    int remaining = buf->count;
    while (remaining > 0) {
        remaining--;
        free(buf->lines[remaining]);
    }
    free(buf->lines);
    init_line_buffer(buf);
}
// --- Forward Declarations for Parsers --- | |
void parse_inlines_recursive(MDNode* parent_node, const char* text_start, const char* text_end); | |
void parse_inlines(MDNode* parent_node, const char* text); | |
MDNode* parse_blocks(const char** lines, int num_lines, int* current_line_index, MDNode* parent_container); | |
MDNode* parse_horizontal_rule(const char* line); | |
// --- Inline Parsing --- | |
// Scans [s, text_end) for the first occurrence of `c` that is not written as
// the two-character sequence backslash + `c`. Returns a pointer to it, or
// NULL when there is none. A backslash followed by any other character is
// treated as an ordinary character.
const char* find_next_unescaped(const char* s, const char* text_end, char c) {
    const char* cur = s;
    while (cur < text_end) {
        const bool escaped_target =
            (*cur == '\\') && (cur + 1 < text_end) && (cur[1] == c);
        if (escaped_target) {
            cur += 2;   // step over the backslash and the escaped character
            continue;
        }
        if (*cur == c) {
            return cur;
        }
        ++cur;
    }
    return NULL;
}
// Looks for the closing occurrence of `marker` (length `marker_len`) after the
// opening one located at `start`. The search begins right after the opening
// marker and never lets a candidate extend past `text_end`. A candidate whose
// preceding character is a backslash is considered escaped and skipped.
// Returns a pointer to the closing marker, or NULL if none is found.
const char* find_matching_delimiter(const char* start, const char* text_end, const char* marker, int marker_len) {
    const char* const last_candidate = text_end - marker_len;
    const char* scan = start + marker_len;

    while (scan <= last_candidate) {
        if (strncmp(scan, marker, marker_len) != 0) {
            scan++;
            continue;
        }
        const bool escaped = (scan > start) && (scan[-1] == '\\');
        if (!escaped) {
            return scan;
        }
        scan += marker_len;   // skip the escaped marker as a whole
    }
    return NULL;
}
// Appends [seg_start, seg_end) to `parent` as a TEXT node; no-op for an empty range.
static void md_inline_flush_text(MDNode* parent, const char* seg_start, const char* seg_end) {
    if (seg_end <= seg_start) return;
    MDNode* text_node = md_node_new(MD_NODE_TEXT);
    text_node->text_content = md_strndup(seg_start, (size_t)(seg_end - seg_start));
    md_node_add_child(parent, text_node);
}

// Given `s` just past an opening '(', returns the ')' that balances it within
// [s, text_end), honouring nested parentheses and backslash escapes, or NULL.
static const char* md_inline_find_closing_paren(const char* s, const char* text_end) {
    int balance = 1;
    while (s < text_end) {
        if (*s == '\\' && (s + 1) < text_end) {
            s += 2;
            continue;
        }
        if (*s == '(') {
            balance++;
        } else if (*s == ')' && --balance == 0) {
            return s;
        }
        s++;
    }
    return NULL;
}

// Splits the text between the parentheses of a link/image, [dest_start, dest_end),
// into node->url and an optional node->title.
// Simplified title rule: the first whitespace character directly followed by
// a '"' or '\'' that has a matching closing quote starts the title; the URL is
// whatever precedes that whitespace, trimmed of surrounding whitespace.
static void md_inline_set_destination(MDNode* node, const char* dest_start, const char* dest_end) {
    const char* url_end = dest_end;

    for (const char* s = dest_start; s < dest_end; ++s) {
        if (!isspace((unsigned char)*s) || (s + 1) >= dest_end) continue;
        const char quote_char = s[1];
        if (quote_char != '"' && quote_char != '\'') continue;

        const char* t_start = s + 2;
        const char* t_end = t_start;
        while (t_end < dest_end && *t_end != quote_char) {
            if (*t_end == '\\' && (t_end + 1) < dest_end) t_end++;  // keep escaped char inside the title
            t_end++;
        }
        if (t_end < dest_end && *t_end == quote_char) {
            node->title = md_strndup(t_start, (size_t)(t_end - t_start));
            url_end = s;   // URL ends before the whitespace that introduces the title
            break;
        }
    }

    const char* url_start = dest_start;
    while (url_start < url_end && isspace((unsigned char)*url_start)) url_start++;
    // trim_trailing_whitespace passes NULL through, so an allocation failure yields url == NULL.
    node->url = trim_trailing_whitespace(md_strndup(url_start, (size_t)(url_end - url_start)));
}

// True when the `avail` bytes at `s` begin with one of the autolink schemes.
// Every comparison is bounded by `avail`, so nothing past the range is examined.
static bool md_inline_has_uri_scheme(const char* s, size_t avail) {
    static const char* const schemes[] = { "http://", "https://", "mailto:", "ftp://" };
    for (size_t i = 0; i < sizeof schemes / sizeof schemes[0]; ++i) {
        const size_t n = strlen(schemes[i]);
        if (avail >= n && strncmp(s, schemes[i], n) == 0) return true;
    }
    return false;
}

// Parses the inline Markdown in [text_start, text_end) and appends the
// resulting nodes to `parent_node` in document order.
//
// At each position the constructs are tried in this order, and once a
// construct's opening pattern matches no later construct is tried there:
//   1. backslash escape of one of  *_`~[]()#+-.<>!
//   2. image  ![alt](url "title")  /  link  [text](url "title")
//   3. strong (** or __), emphasis (* or _), strikethrough (~~)
//   4. inline code between single backticks
//   5. autolink <scheme...>  or a basic inline HTML tag
// When the construct cannot be completed (e.g. no closing delimiter) the
// opening character is kept as literal text. Text between recognised
// constructs becomes TEXT nodes. Link text and the contents of
// strong/emphasis/strikethrough spans are parsed recursively.
void parse_inlines_recursive(MDNode* parent_node, const char* text_start, const char* text_end) {
    if (text_start >= text_end) return;

    const char* p = text_start;
    const char* current_segment_start = text_start;   // start of not-yet-emitted literal text

    while (p < text_end) {
        const char* next_p = NULL;                     // set once an inline element is emitted at p
        const bool has_next = (p + 1) < text_end;
        const char c = *p;
        const char c1 = has_next ? p[1] : '\0';

        // 1. Escaped characters
        if (c == '\\' && has_next && strchr("*_`~[]()#+-.<>!", c1)) {
            md_inline_flush_text(parent_node, current_segment_start, p);
            MDNode* esc_node = md_node_new(MD_NODE_ESCAPED_CHAR);
            esc_node->text_content = md_strndup(p + 1, 1);
            md_node_add_child(parent_node, esc_node);
            next_p = p + 2;
        }
        // 2. Images ![alt](url "title") and links [text](url "title")
        else if ((c == '!' && c1 == '[') || c == '[') {
            const bool is_image = (c == '!');
            const char* label_start = p + (is_image ? 2 : 1);
            const char* label_end = find_next_unescaped(label_start, text_end, ']');
            if (label_end && (label_end + 1) < text_end && label_end[1] == '(') {
                const char* dest_start = label_end + 2;
                const char* dest_end = md_inline_find_closing_paren(dest_start, text_end);
                if (dest_end) {
                    md_inline_flush_text(parent_node, current_segment_start, p);
                    MDNode* ref_node = md_node_new(is_image ? MD_NODE_IMAGE : MD_NODE_LINK);
                    if (is_image) {
                        ref_node->alt_text = md_strndup(label_start, (size_t)(label_end - label_start));
                    }
                    md_inline_set_destination(ref_node, dest_start, dest_end);
                    md_node_add_child(parent_node, ref_node);
                    if (!is_image) {
                        parse_inlines_recursive(ref_node, label_start, label_end);  // link text
                    }
                    next_p = dest_end + 1;
                }
            }
        }
        // 3. Strong (**, __), emphasis (*, _), strikethrough (~~).
        //    No CommonMark flanking / intra-word rules are applied.
        else if (c == '*' || c == '_' || (c == '~' && c1 == '~')) {
            const bool doubled = (c1 == c);
            const int marker_len = doubled ? 2 : 1;
            const char marker[3] = { c, doubled ? c : '\0', '\0' };

            MDNodeType span_type = MD_NODE_EMPHASIS;
            if (c == '~') {
                span_type = MD_NODE_STRIKETHROUGH;
            } else if (doubled) {
                span_type = MD_NODE_STRONG;
            }

            const char* end_marker = find_matching_delimiter(p, text_end, marker, marker_len);
            // Single-character emphasis must enclose at least one character.
            if (end_marker && (doubled || end_marker > p + 1)) {
                md_inline_flush_text(parent_node, current_segment_start, p);
                MDNode* span_node = md_node_new(span_type);
                md_node_add_child(parent_node, span_node);
                parse_inlines_recursive(span_node, p + marker_len, end_marker);
                next_p = end_marker + marker_len;
            }
        }
        // 4. Inline code (`) - simplified, doesn't handle `` code ``, etc.
        else if (c == '`') {
            const char* end_marker = find_next_unescaped(p + 1, text_end, '`');
            if (end_marker) {
                md_inline_flush_text(parent_node, current_segment_start, p);
                const char* code_s = p + 1;
                size_t code_len = (size_t)(end_marker - code_s);
                // Strip one space from each side when both sides have one and
                // the content is not made of spaces only.
                if (code_len >= 2 && code_s[0] == ' ' && end_marker[-1] == ' ') {
                    bool all_spaces = true;
                    for (size_t i = 0; i < code_len; ++i) {
                        if (code_s[i] != ' ') { all_spaces = false; break; }
                    }
                    if (!all_spaces) {
                        code_s++;
                        code_len -= 2;
                    }
                }
                MDNode* code_node = md_node_new(MD_NODE_INLINE_CODE);
                code_node->text_content = md_strndup(code_s, code_len);
                md_node_add_child(parent_node, code_node);
                next_p = end_marker + 1;
            }
        }
        // 5. Autolinks <http://...>, <mailto:...>, ... and basic HTML tags
        else if (c == '<') {
            const char* body = p + 1;
            const size_t avail = (size_t)(text_end - body);
            if (md_inline_has_uri_scheme(body, avail)) {
                const char* end_autolink = find_next_unescaped(body, text_end, '>');
                if (end_autolink) {
                    md_inline_flush_text(parent_node, current_segment_start, p);
                    MDNode* link_node = md_node_new(MD_NODE_LINK);
                    link_node->url = md_strndup(body, (size_t)(end_autolink - body));
                    MDNode* text_child = md_node_new(MD_NODE_TEXT);   // autolink text is its URL
                    text_child->text_content = md_strdup(link_node->url);
                    md_node_add_child(link_node, text_child);
                    md_node_add_child(parent_node, link_node);
                    next_p = end_autolink + 1;
                }
            } else if (has_next && (isalpha((unsigned char)c1) || c1 == '/' || c1 == '!')) {
                // Bounded search: the range need not be NUL-terminated at text_end.
                const char* end_tag = (const char*)memchr(body, '>', avail);
                if (end_tag) {
                    md_inline_flush_text(parent_node, current_segment_start, p);
                    MDNode* html_node = md_node_new(MD_NODE_HTML_INLINE);
                    html_node->text_content = md_strndup(p, (size_t)(end_tag - p) + 1);
                    md_node_add_child(parent_node, html_node);
                    next_p = end_tag + 1;
                }
            }
        }

        if (next_p) {          // an inline element was emitted: resume right after it
            p = next_p;
            current_segment_start = p;
        } else {               // nothing starts here: the character stays literal text
            p++;
        }
    }

    // Any remaining literal text
    md_inline_flush_text(parent_node, current_segment_start, p);
}
// Convenience wrapper: parses the whole NUL-terminated `text` as inline
// content of `parent_block_node`. NULL or empty text adds nothing.
void parse_inlines(MDNode* parent_block_node, const char* text) {
    if (text == NULL || text[0] == '\0') {
        return;
    }
    const char* text_end = text + strlen(text);
    parse_inlines_recursive(parent_block_node, text, text_end);
}
// --- Block Parsing --- | |
// Decides whether `line` starts a list item.
//
// Recognised markers, after any number of leading spaces:
//   - bullet:  '*', '-' or '+'
//   - ordered: 1 to 9 decimal digits followed by '.' or ')'
// The marker must be followed by a space, a tab, a newline or the end of the
// string; otherwise the line is not a list item (so "*bold*" and "-1. x" are
// rejected).
//
// Returns the bullet character for a bullet item, the delimiter ('.' or ')')
// for an ordered item, and 0 when the line is not a list item.
// Out-parameters (each may be NULL):
//   out_indent              number of leading spaces          (set on success)
//   out_content_start       first char after marker + blanks  (set on success)
//   out_number_if_ordered   item number; always written, 0 unless ordered
//   out_actual_marker_char  same value as the return value    (set on success)
char is_list_item_line(const char* line, int* out_indent, const char** out_content_start, int* out_number_if_ordered, char* out_actual_marker_char) {
    if (out_number_if_ordered) *out_number_if_ordered = 0;
    if (!line) return 0;

    int indent = 0;
    const char* p = line;
    while (*p == ' ') {
        p++;
        indent++;
    }

    char marker_char = 0;
    int number = 0;
    const char* after_marker = NULL;

    if (*p == '*' || *p == '-' || *p == '+') {
        // Unordered list: *, -, +
        marker_char = *p;
        after_marker = p + 1;
    } else if (isdigit((unsigned char)*p)) {
        // Ordered list: 1., 1)  -- at most 9 digits, so the value always fits in an int.
        int digits = 0;
        while (isdigit((unsigned char)*p)) {
            if (++digits > 9) return 0;
            number = number * 10 + (*p - '0');
            p++;
        }
        if (*p != '.' && *p != ')') return 0;
        marker_char = *p;
        after_marker = p + 1;
    } else {
        return 0;
    }

    if (*after_marker != ' ' && *after_marker != '\t' &&
        *after_marker != '\0' && *after_marker != '\n') {
        return 0;   // e.g. "*emphasis*" or "1.5": marker not followed by a blank
    }

    // Content starts after all spaces/tabs that follow the marker.
    const char* content = after_marker;
    while (*content == ' ' || *content == '\t') content++;

    if (out_indent) *out_indent = indent;
    if (out_content_start) *out_content_start = content;
    if (out_number_if_ordered) *out_number_if_ordered = number;
    if (out_actual_marker_char) *out_actual_marker_char = marker_char;
    return marker_char;
}
MDNode* parse_heading_atx(const char* line) { | |
int level = 0; | |
const char* p = line; // line is already trimmed of leading spaces by caller | |
while (*p == '#') { | |
level++; | |
p++; | |
} | |
if (level == 0 || level > 6) return NULL; | |
if (*p != ' ' && *p != '\t' && *p != '\0' && *p != '\n') return NULL; // Must have space or be empty after #s | |
while (*p == ' ' || *p == '\t') p++; // Skip spaces after #s | |
char* temp_content = md_strdup(p); | |
char* end = temp_content + strlen(temp_content) - 1; | |
while (end >= temp_content && (*end == ' ' || *end == '\t' || *end == '#')) { | |
if (*end == '#') { // Remove trailing '#'s only if they are preceded by space or more '#'s | |
char* check = end -1; | |
bool clear_to_remove_hash = (check < temp_content || *check == ' ' || *check == '\t' || *check == '#'); | |
if (clear_to_remove_hash) end--; else break; | |
} else { // It's a space/tab | |
end--; | |
} | |
} | |
*(end + 1) = '\0'; | |
MDNode* heading_node = md_node_new(MD_NODE_HEADING); | |
heading_node->heading_level = level; | |
parse_inlines(heading_node, temp_content); | |
free(temp_content); | |
return heading_node; | |
} | |
MDNode* parse_fenced_code_block(const char** lines, int num_lines, int* current_line_index) { | |
const char* first_line = lines[*current_line_index]; | |
const char* trimmed_first_line = trim_leading_whitespace(first_line); | |
int line_indent = count_leading_spaces(first_line); | |
char fence_char = trimmed_first_line[0]; | |
int fence_len = 0; | |
const char* p = trimmed_first_line; | |
while (*p == fence_char) { | |
fence_len++; | |
p++; | |
} | |
if (fence_len < 3) return NULL; | |
const char* lang_start = p; | |
while (*lang_start == ' ' || *lang_start == '\t') lang_start++; // Skip spaces before info string | |
char* lang_info = md_strdup(lang_start); | |
trim_trailing_whitespace(lang_info); | |
if (fence_char == '`' && strchr(lang_info, '`')) { // Info string cannot contain backticks for backtick fences | |
free(lang_info); return NULL; | |
} | |
MDNode* code_node = md_node_new(MD_NODE_FENCED_CODE_BLOCK); | |
if (strlen(lang_info) > 0) { | |
char* first_word_end = lang_info; | |
while(*first_word_end && !isspace((unsigned char)*first_word_end)) first_word_end++; | |
*first_word_end = '\0'; // Take only first word as language | |
code_node->code_language = md_strdup(lang_info); | |
} | |
free(lang_info); | |
(*current_line_index)++; | |
LineBuffer content_buffer; | |
init_line_buffer(&content_buffer); | |
while (*current_line_index < num_lines) { | |
const char* current_line_orig = lines[*current_line_index]; | |
const char* current_line_ptr = current_line_orig; | |
int current_indent = 0; | |
while(current_indent < line_indent && *current_line_ptr == ' ') { // CommonMark allows indent for closing fence | |
current_line_ptr++; | |
current_indent++; | |
} | |
const char* q = current_line_ptr; | |
int closing_fence_len = 0; | |
while(*q == fence_char) { | |
closing_fence_len++; | |
q++; | |
} | |
q = trim_leading_whitespace(q); | |
if (closing_fence_len >= fence_len && is_blank_line(q)) { | |
(*current_line_index)++; | |
break; | |
} | |
// Remove up to `line_indent` common indent from content lines | |
int content_line_indent = count_leading_spaces(current_line_orig); | |
int effective_indent_to_remove = (content_line_indent < line_indent) ? content_line_indent : line_indent; | |
add_line_to_buffer(&content_buffer, current_line_orig + effective_indent_to_remove); | |
(*current_line_index)++; | |
} | |
size_t total_len = 0; | |
for (int i = 0; i < content_buffer.count; ++i) { | |
total_len += strlen(content_buffer.lines[i]) + 1; | |
} | |
if (total_len > 0) { | |
code_node->code_content = (char*)malloc(total_len); | |
code_node->code_content[0] = '\0'; | |
for (int i = 0; i < content_buffer.count; ++i) { | |
strcat(code_node->code_content, content_buffer.lines[i]); | |
if (i < content_buffer.count - 1 || content_buffer.count > 0 ) { // Add \n unless it's the very last line and buffer is not empty | |
strcat(code_node->code_content, "\n"); | |
} | |
} | |
// Trim final newline if content buffer was not empty. | |
if (content_buffer.count > 0 && code_node->code_content[strlen(code_node->code_content)-1] == '\n') { | |
code_node->code_content[strlen(code_node->code_content)-1] = '\0'; | |
} | |
} else { | |
code_node->code_content = md_strdup(""); | |
} | |
free_line_buffer(&content_buffer); | |
return code_node; | |
} | |
MDNode* parse_code_block_indented(const char** lines, int num_lines, int* current_line_index) { | |
MDNode* code_node = md_node_new(MD_NODE_CODE_BLOCK); | |
LineBuffer content_buffer; | |
init_line_buffer(&content_buffer); | |
int consecutive_blank_lines_in_code = 0; | |
while (*current_line_index < num_lines) { | |
const char* line = lines[*current_line_index]; | |
if (is_blank_line(line)) { | |
// A blank line can be part of an indented code block. | |
// If we've already collected code lines, add this blank line (with minimal indent) | |
if (content_buffer.count > 0) { | |
// CommonMark: up to 4 spaces of indent are removed from blank lines. | |
int indent = count_leading_spaces(line); | |
add_line_to_buffer(&content_buffer, line + (indent < 4 ? indent : 4) ); | |
consecutive_blank_lines_in_code++; | |
(*current_line_index)++; | |
} else { | |
break; // Initial blank lines are not part of code block | |
} | |
} else { | |
int indent = count_leading_spaces(line); | |
if (indent >= 4) { | |
add_line_to_buffer(&content_buffer, line + 4); // Remove 4 spaces of indent | |
consecutive_blank_lines_in_code = 0; | |
(*current_line_index)++; | |
} else { | |
break; // Not indented enough, end of code block | |
} | |
} | |
} | |
// Trim trailing blank lines from content_buffer | |
while (content_buffer.count > 0 && is_blank_line(content_buffer.lines[content_buffer.count - 1])) { | |
free(content_buffer.lines[--content_buffer.count]); | |
} | |
if (content_buffer.count > 0) { | |
size_t total_len = 0; | |
for (int i = 0; i < content_buffer.count; ++i) { | |
total_len += strlen(content_buffer.lines[i]) + 1; // +1 for \n | |
} | |
code_node->code_content = (char*)malloc(total_len); | |
code_node->code_content[0] = '\0'; | |
for (int i = 0; i < content_buffer.count; ++i) { | |
strcat(code_node->code_content, content_buffer.lines[i]); | |
if (i < content_buffer.count - 1) { // Add \n between lines | |
strcat(code_node->code_content, "\n"); | |
} | |
} | |
} else { | |
// No actual content lines, possibly just blank lines that were trimmed. | |
// It's still a code block if parse_blocks decided it was. | |
code_node->code_content = md_strdup(""); | |
} | |
free_line_buffer(&content_buffer); | |
return code_node; | |
} | |
MDNode* parse_blockquote(const char** lines, int num_lines, int* current_line_index) { | |
MDNode* bq_node = md_node_new(MD_NODE_BLOCKQUOTE); | |
LineBuffer bq_content_lines; | |
init_line_buffer(&bq_content_lines); | |
bool first_line_in_bq = true; | |
while (*current_line_index < num_lines) { | |
const char* line = lines[*current_line_index]; | |
const char* p = line; | |
p = trim_leading_whitespace(p); // Remove leading spaces on the line itself first | |
if (*p == '>') { | |
p++; // Skip '>' | |
if (*p == ' ' || *p == '\t') p++; // Skip optional one space/tab after '>' | |
add_line_to_buffer(&bq_content_lines, p); | |
(*current_line_index)++; | |
first_line_in_bq = false; | |
} else if (!first_line_in_bq && !is_blank_line(line)) { | |
// Lazy continuation: line does not start with '>', but is not blank. | |
// This is allowed if the blockquote has already started. | |
add_line_to_buffer(&bq_content_lines, line); // Pass full line for recursive parsing | |
(*current_line_index)++; | |
} else { | |
// Blank line, or line not starting with '>' (and not lazy continuation). End of blockquote. | |
break; | |
} | |
} | |
if (bq_content_lines.count > 0) { | |
int temp_idx_bq = 0; // Index for recursive call within blockquote content | |
parse_blocks((const char**)bq_content_lines.lines, bq_content_lines.count, &temp_idx_bq, bq_node); | |
} | |
free_line_buffer(&bq_content_lines); | |
if (bq_node->children_count == 0 && bq_content_lines.count == 0) { // Nothing was added | |
md_node_free(bq_node); // If blockquote is empty, discard it | |
return NULL; | |
} | |
return bq_node; | |
} | |
MDNode* parse_list(const char** lines, int num_lines, int* current_line_index, int initial_list_indent) { | |
int item_indent, item_number; | |
const char* item_content_start; | |
char item_actual_marker; | |
char first_marker_char = is_list_item_line(lines[*current_line_index], &item_indent, &item_content_start, &item_number, &item_actual_marker); | |
MDNodeType list_type = (first_marker_char == '.' || first_marker_char == ')') ? MD_NODE_ORDERED_LIST : MD_NODE_UNORDERED_LIST; | |
MDNode* list_node = md_node_new(list_type); | |
if (list_type == MD_NODE_ORDERED_LIST) { | |
list_node->list_start_number = item_number; | |
} | |
list_node->list_item_marker = item_actual_marker; // Store the actual marker char (e.g. '*') for UL, or ('.', ')') for OL. | |
list_node->tight_list = true; // Assume tight, set to false if blank lines appear appropriately | |
int prev_item_end_line = -1; // Track end line of previous item to check for blank lines between items | |
while (*current_line_index < num_lines) { | |
const char* current_item_line_str = lines[*current_line_index]; | |
int current_item_indent, current_item_number; | |
const char* current_item_content_s; | |
char current_item_actual_marker; | |
char current_marker_type = is_list_item_line(current_item_line_str, ¤t_item_indent, ¤t_item_content_s, ¤t_item_number, ¤t_item_actual_marker); | |
if (!current_marker_type || current_item_indent < initial_list_indent) { | |
break; // Not a list item for this list, or indented less (ends current list) | |
} | |
bool same_list_type_check = (list_type == MD_NODE_ORDERED_LIST && (current_marker_type == '.' || current_marker_type == ')')) || | |
(list_type == MD_NODE_UNORDERED_LIST && (current_marker_type != '.' && current_marker_type != ')')); | |
// For unordered lists, marker type can change (- then *). For ordered, delimiter (. or )) should be consistent for same list (CommonMark rule). | |
// My `list_item_marker` on `list_node` stores the *first* item's marker. | |
// A stricter check for OL: if (list_type == MD_NODE_ORDERED_LIST && current_item_actual_marker != list_node->list_item_marker) break; | |
if (!same_list_type_check) { | |
break; // Type changed, so new list. | |
} | |
// Check for blank lines between this item and the previous one | |
if (prev_item_end_line != -1 && *current_line_index > prev_item_end_line) { | |
for (int i = prev_item_end_line; i < *current_line_index; ++i) { | |
if (is_blank_line(lines[i])) { | |
list_node->tight_list = false; | |
break; | |
} | |
} | |
} | |
MDNode* item_node = md_node_new(MD_NODE_LIST_ITEM); | |
item_node->list_item_marker = current_item_actual_marker; | |
if (list_type == MD_NODE_ORDERED_LIST && item_node->parent == list_node && list_node->children_count == 0) { | |
// This is the first item of an ordered list. Set its number. | |
// The list_node->list_start_number is already set. Item nodes don't typically store their own number. | |
} | |
LineBuffer item_content_lines_buf; | |
init_line_buffer(&item_content_lines_buf); | |
add_line_to_buffer(&item_content_lines_buf, current_item_content_s); // First line of item content | |
int item_first_line_idx = *current_line_index; | |
(*current_line_index)++; // Consumed item marker line | |
// Calculate required indentation for continuation lines of this item | |
// Content column is indent of first content char relative to line start | |
int content_column_for_item = current_item_content_s - current_item_line_str; | |
// Or, marker length + space (e.g. "- " is 2, "1. " is 3) | |
int marker_len_plus_space = (current_item_content_s - (current_item_line_str + current_item_indent)) + current_item_indent; | |
bool item_ended_with_blank_line = false; | |
bool item_contains_multiple_blocks_or_internal_blanks = false; | |
while (*current_line_index < num_lines) { | |
const char* next_line_str = lines[*current_line_index]; | |
int next_line_indent_val, dummy_num; const char* dummy_content; char dummy_marker; | |
char next_line_is_list_item_marker = is_list_item_line(next_line_str, &next_line_indent_val, &dummy_content, &dummy_num, &dummy_marker); | |
if (next_line_is_list_item_marker && next_line_indent_val >= initial_list_indent) { | |
break; // New sibling item starts, current item ends. | |
} | |
if (is_blank_line(next_line_str)) { | |
// A blank line. If it's followed by properly indented content, it's part of this item. | |
// It might make the list loose. | |
if (*current_line_index + 1 < num_lines) { | |
const char* after_blank_line = lines[*current_line_index + 1]; | |
int after_blank_indent = count_leading_spaces(after_blank_line); | |
char next_next_is_item = is_list_item_line(after_blank_line, &next_line_indent_val, &dummy_content, &dummy_num, &dummy_marker); | |
if (after_blank_indent >= marker_len_plus_space && !next_next_is_item) { | |
// Indented enough to continue item, and not a new item marker itself. | |
add_line_to_buffer(&item_content_lines_buf, ""); // Add the blank line | |
item_ended_with_blank_line = true; // Mark that a blank line was consumed for this item | |
item_contains_multiple_blocks_or_internal_blanks = true; | |
} else { | |
break; // Blank line not part of this item (e.g. separates from next block or ends list) | |
} | |
} else { // Blank line at EOF | |
break; | |
} | |
} else { // Not a blank line | |
int current_content_line_indent = count_leading_spaces(next_line_str); | |
if (current_content_line_indent >= marker_len_plus_space) { // Properly indented continuation line | |
// Add line with indentation removed up to marker_len_plus_space | |
add_line_to_buffer(&item_content_lines_buf, next_line_str + marker_len_plus_space); | |
item_ended_with_blank_line = false; // Reset if content follows a blank | |
} else { | |
break; // Not indented enough to be part of this item. | |
} | |
} | |
(*current_line_index)++; | |
} | |
// Parse collected content for this item | |
int temp_item_idx = 0; | |
parse_blocks((const char**)item_content_lines_buf.lines, item_content_lines_buf.count, &temp_item_idx, item_node); | |
free_line_buffer(&item_content_lines_buf); | |
if (item_node->children_count > 1) { // Multiple blocks in item implies loose list. | |
item_contains_multiple_blocks_or_internal_blanks = true; | |
} | |
// If an item has internal blank lines that separate its blocks, or ends with blank lines | |
// that are part of its content, it contributes to a loose list. | |
if (item_contains_multiple_blocks_or_internal_blanks) { | |
list_node->tight_list = false; | |
} | |
md_node_add_child(list_node, item_node); | |
prev_item_end_line = *current_line_index; // current_line_index is now start of next line/item or EOF | |
} | |
if (list_node->children_count == 0) { // No items added | |
md_node_free(list_node); | |
return NULL; | |
} | |
return list_node; | |
} | |
// Recognises a horizontal-rule line.
//
// `line` is expected to have had its leading whitespace removed by the caller.
// The line matches when its first character is '*', '-' or '_', every other
// character is either that same marker or whitespace, and the marker occurs
// at least three times.
//
// Returns the result of md_node_new(MD_NODE_HORIZONTAL_RULE) on a match and
// NULL otherwise. The caller owns the returned node.
MDNode* parse_horizontal_rule(const char* line) {
    const char marker = line[0];
    if (marker != '*' && marker != '-' && marker != '_') {
        return NULL;
    }

    int marker_count = 0;
    for (const char* cursor = line; *cursor != '\0'; ++cursor) {
        if (*cursor == marker) {
            ++marker_count;
            continue;
        }
        if (!isspace((unsigned char)*cursor)) {
            return NULL; // Something other than the marker or whitespace
        }
    }

    // Fewer than three markers is not a rule.
    return (marker_count >= 3) ? md_node_new(MD_NODE_HORIZONTAL_RULE) : NULL;
}
MDNode* parse_blocks(const char** lines, int num_lines, int* current_line_index, MDNode* parent_container) { | |
LineBuffer paragraph_buffer; | |
init_line_buffer(¶graph_buffer); | |
while (*current_line_index < num_lines) { | |
const char* line_orig = lines[*current_line_index]; | |
const char* line_trimmed_leading = trim_leading_whitespace(line_orig); | |
int initial_indent_spaces = count_leading_spaces(line_orig); | |
// 1. Try Setext heading if paragraph_buffer has content | |
if (paragraph_buffer.count > 0) { | |
int setext_level = 0; | |
const char* p_setext = line_trimmed_leading; | |
char setext_char_test = *p_setext; | |
if (setext_char_test == '=' || setext_char_test == '-') { | |
while (*p_setext == setext_char_test) p_setext++; | |
if (is_blank_line(p_setext)) { // Line consists only of '=' or '-' | |
bool is_hr_candidate = false; | |
if (setext_char_test == '-') { | |
MDNode* hr_test = parse_horizontal_rule(line_trimmed_leading); | |
if (hr_test) { is_hr_candidate = true; md_node_free(hr_test); } | |
} | |
if (!is_hr_candidate || setext_char_test == '=') { // '=' underlines are not HRs | |
setext_level = (setext_char_test == '=') ? 1 : 2; | |
} | |
} | |
} | |
if (setext_level > 0) { | |
MDNode* heading_node = md_node_new(MD_NODE_HEADING); | |
heading_node->heading_level = setext_level; | |
size_t total_para_len = 0; | |
for (int i = 0; i < paragraph_buffer.count; ++i) total_para_len += strlen(paragraph_buffer.lines[i]) + 1; | |
char* full_text = (char*)malloc(total_para_len + 1); // +1 for null terminator | |
if(full_text) { | |
full_text[0] = '\0'; | |
for (int i = 0; i < paragraph_buffer.count; ++i) { | |
strcat(full_text, paragraph_buffer.lines[i]); | |
if (i < paragraph_buffer.count - 1) strcat(full_text, " "); // Join with space | |
} | |
parse_inlines(heading_node, full_text); | |
free(full_text); | |
} | |
md_node_add_child(parent_container, heading_node); | |
free_line_buffer(¶graph_buffer); | |
(*current_line_index)++; // Consume the setext marker line | |
continue; | |
} | |
} | |
// 2. Flush paragraph if current line starts a new block type or is blank | |
bool line_starts_new_non_para_block = false; | |
if (*line_trimmed_leading == '#') line_starts_new_non_para_block = true; // ATX Heading | |
else if (strncmp(line_trimmed_leading, "```", 3) == 0 || strncmp(line_trimmed_leading, "~~~", 3) == 0) line_starts_new_non_para_block = true; // Fenced Code | |
else if (parse_horizontal_rule(line_trimmed_leading) != NULL) { // HR | |
MDNode* hr_test = parse_horizontal_rule(line_trimmed_leading); // Need to free this test node | |
if (hr_test) { line_starts_new_non_para_block = true; md_node_free(hr_test); } | |
} | |
else if (*line_trimmed_leading == '>') line_starts_new_non_para_block = true; // Blockquote | |
else if (is_list_item_line(line_orig, &(int){0}, &(const char*){NULL}, &(int){0}, &(char){0})) line_starts_new_non_para_block = true; // List item | |
else if (paragraph_buffer.count == 0 && initial_indent_spaces >= 4 && !is_blank_line(line_orig)) line_starts_new_non_para_block = true; // Indented code, if para empty | |
if (paragraph_buffer.count > 0 && (line_starts_new_non_para_block || is_blank_line(line_orig))) { | |
MDNode* para_node = md_node_new(MD_NODE_PARAGRAPH); | |
for (int i = 0; i < paragraph_buffer.count; ++i) { | |
char* current_para_line = paragraph_buffer.lines[i]; // This is already trimmed leading | |
size_t len = strlen(current_para_line); | |
bool hard_break = (len >= 2 && current_para_line[len-1] == ' ' && current_para_line[len-2] == ' '); | |
char* text_to_parse_inlines = md_strdup(current_para_line); | |
if (hard_break) text_to_parse_inlines[len-2] = '\0'; // Remove trailing spaces for inline parsing | |
parse_inlines(para_node, text_to_parse_inlines); | |
free(text_to_parse_inlines); | |
if (hard_break) { | |
md_node_add_child(para_node, md_node_new(MD_NODE_HARD_BREAK)); | |
} else if (i < paragraph_buffer.count - 1) { | |
md_node_add_child(para_node, md_node_new(MD_NODE_SOFT_BREAK)); | |
} | |
} | |
if(para_node->children_count > 0) md_node_add_child(parent_container, para_node); | |
else md_node_free(para_node); // Empty paragraph | |
free_line_buffer(¶graph_buffer); | |
} | |
// 3. Process current line based on its type | |
if (is_blank_line(line_orig)) { | |
(*current_line_index)++; | |
continue; | |
} | |
MDNode* new_block_node = NULL; | |
if (*line_trimmed_leading == '#') { // ATX Heading | |
new_block_node = parse_heading_atx(line_trimmed_leading); | |
if (new_block_node) (*current_line_index)++; | |
} else if (strncmp(line_trimmed_leading, "```", 3) == 0 || strncmp(line_trimmed_leading, "~~~", 3) == 0) { // Fenced Code | |
new_block_node = parse_fenced_code_block(lines, num_lines, current_line_index); // Advances index internally | |
} else { | |
MDNode* hr_try = parse_horizontal_rule(line_trimmed_leading); // HR | |
if (hr_try) { | |
new_block_node = hr_try; | |
(*current_line_index)++; | |
} | |
} | |
if (!new_block_node && *line_trimmed_leading == '>') { // Blockquote | |
new_block_node = parse_blockquote(lines, num_lines, current_line_index); // Advances index internally | |
} | |
if (!new_block_node && is_list_item_line(line_orig, &(int){0}, &(const char*){NULL}, &(int){0}, &(char){0})) { // List item | |
new_block_node = parse_list(lines, num_lines, current_line_index, initial_indent_spaces); // Advances index internally | |
} | |
if (!new_block_node && initial_indent_spaces >= 4 && paragraph_buffer.count == 0) { // Indented Code Block (only if not continuing a paragraph) | |
new_block_node = parse_code_block_indented(lines, num_lines, current_line_index); // Advances index internally | |
} | |
if (new_block_node) { | |
md_node_add_child(parent_container, new_block_node); | |
// Index already advanced by specific parsers or above | |
continue; | |
} | |
// 4. If nothing else, it's a paragraph line | |
// Line content for paragraph is trimmed of leading/trailing whitespace common to the block, | |
// but internal structure (like trailing spaces for hard break) is preserved. | |
char* para_line_content = md_strdup(line_trimmed_leading); | |
// Don't trim_trailing_whitespace here, as it might remove " " for hard break. | |
add_line_to_buffer(¶graph_buffer, para_line_content); | |
free(para_line_content); | |
(*current_line_index)++; | |
} | |
// Final flush of paragraph_buffer | |
if (paragraph_buffer.count > 0) { | |
MDNode* para_node = md_node_new(MD_NODE_PARAGRAPH); | |
for (int i = 0; i < paragraph_buffer.count; ++i) { | |
char* current_para_line = paragraph_buffer.lines[i]; | |
size_t len = strlen(current_para_line); | |
bool hard_break = (len >= 2 && current_para_line[len-1] == ' ' && current_para_line[len-2] == ' '); | |
char* text_to_parse_inlines = md_strdup(current_para_line); | |
if (hard_break) text_to_parse_inlines[len-2] = '\0'; | |
parse_inlines(para_node, text_to_parse_inlines); | |
free(text_to_parse_inlines); | |
if (hard_break) { | |
md_node_add_child(para_node, md_node_new(MD_NODE_HARD_BREAK)); | |
} else if (i < paragraph_buffer.count - 1) { | |
md_node_add_child(para_node, md_node_new(MD_NODE_SOFT_BREAK)); | |
} | |
} | |
if(para_node->children_count > 0) md_node_add_child(parent_container, para_node); | |
else md_node_free(para_node); | |
free_line_buffer(¶graph_buffer); | |
} | |
return parent_container; | |
} | |
MDNode* parse_document_from_lines(const char** lines, int num_lines) { | |
MDNode* doc = md_node_new(MD_NODE_DOCUMENT); | |
int current_line_idx = 0; | |
parse_blocks(lines, num_lines, ¤t_line_idx, doc); | |
return doc; | |
} | |
MDNode* parse_markdown(const char* markdown_text) { | |
LineBuffer line_buf; | |
init_line_buffer(&line_buf); | |
const char* p = markdown_text; | |
const char* line_start = p; | |
while (1) { | |
const char* line_end = strchr(line_start, '\n'); | |
bool last_line = (line_end == NULL); | |
size_t current_line_len = last_line ? strlen(line_start) : (line_end - line_start); | |
char* segment = md_strndup(line_start, current_line_len); | |
if (segment) { | |
char* expanded_segment = expand_tabs(segment); // Expand tabs | |
if (expanded_segment) { | |
add_line_to_buffer(&line_buf, expanded_segment); | |
free(expanded_segment); | |
} else { // Fallback if expand_tabs fails | |
add_line_to_buffer(&line_buf, segment); | |
} | |
free(segment); | |
} | |
if (last_line) break; | |
line_start = line_end + 1; | |
} | |
MDNode* doc = parse_document_from_lines((const char**)line_buf.lines, line_buf.count); | |
free_line_buffer(&line_buf); | |
return doc; | |
} | |
// --- AST Printer for Demo/Debug --- | |
void print_ast_node(MDNode* node, int indent_level) { | |
if (!node) return; | |
for (int i = 0; i < indent_level; ++i) printf(" "); | |
switch (node->type) { | |
case MD_NODE_DOCUMENT: printf("DOCUMENT\n"); break; | |
case MD_NODE_PARAGRAPH: printf("PARAGRAPH\n"); break; | |
case MD_NODE_HEADING: printf("HEADING (Level %d)\n", node->heading_level); break; | |
case MD_NODE_BLOCKQUOTE: printf("BLOCKQUOTE\n"); break; | |
case MD_NODE_UNORDERED_LIST: printf("UNORDERED_LIST (Marker: '%c', Tight: %s)\n", node->list_item_marker, node->tight_list ? "yes" : "no"); break; | |
case MD_NODE_ORDERED_LIST: printf("ORDERED_LIST (Start: %d, Marker: '%c', Tight: %s)\n", node->list_start_number, node->list_item_marker, node->tight_list ? "yes" : "no"); break; | |
case MD_NODE_LIST_ITEM: printf("LIST_ITEM (Marker: '%c')\n", node->list_item_marker); break; | |
case MD_NODE_CODE_BLOCK: printf("CODE_BLOCK (Indented)\n"); | |
if(node->code_content && strlen(node->code_content) < 60) { | |
for (int i = 0; i < indent_level + 1; ++i) printf(" "); printf("Content: [[%s]]\n", node->code_content); | |
} else if (node->code_content) { | |
for (int i = 0; i < indent_level + 1; ++i) printf(" "); printf("Content: (long)\n"); | |
} break; | |
case MD_NODE_FENCED_CODE_BLOCK: printf("FENCED_CODE_BLOCK (Lang: %s)\n", node->code_language ? node->code_language : "(none)"); | |
if(node->code_content && strlen(node->code_content) < 60) { | |
for (int i = 0; i < indent_level + 1; ++i) printf(" "); printf("Content: [[%s]]\n", node->code_content); | |
} else if (node->code_content) { | |
for (int i = 0; i < indent_level + 1; ++i) printf(" "); printf("Content: (long)\n"); | |
} break; | |
case MD_NODE_HORIZONTAL_RULE: printf("HORIZONTAL_RULE\n"); break; | |
case MD_NODE_HTML_BLOCK: printf("HTML_BLOCK: [[%s]]\n", node->text_content ? node->text_content : ""); break; | |
case MD_NODE_TEXT: printf("TEXT: \"%s\"\n", node->text_content ? node->text_content : ""); break; | |
case MD_NODE_EMPHASIS: printf("EMPHASIS\n"); break; | |
case MD_NODE_STRONG: printf("STRONG\n"); break; | |
case MD_NODE_STRIKETHROUGH: printf("STRIKETHROUGH\n"); break; | |
case MD_NODE_INLINE_CODE: printf("INLINE_CODE: `\"%s\"`\n", node->text_content ? node->text_content : ""); break; | |
case MD_NODE_LINK: printf("LINK (URL: %s, Title: %s)\n", node->url ? node->url : "", node->title ? node->title : ""); break; | |
case MD_NODE_IMAGE: printf("IMAGE (Alt: %s, URL: %s, Title: %s)\n", node->alt_text ? node->alt_text : "", node->url ? node->url : "", node->title ? node->title : ""); break; | |
case MD_NODE_SOFT_BREAK: printf("SOFT_BREAK\n"); break; | |
case MD_NODE_HARD_BREAK: printf("HARD_BREAK\n"); break; | |
case MD_NODE_HTML_INLINE: printf("HTML_INLINE: %s\n", node->text_content ? node->text_content : ""); break; | |
case MD_NODE_ESCAPED_CHAR: printf("ESCAPED_CHAR: \\%s\n", node->text_content ? node->text_content : ""); break; | |
default: printf("UNKNOWN_NODE (%d)\n", node->type); break; | |
} | |
for (size_t i = 0; i < node->children_count; ++i) { | |
print_ast_node(node->children[i], indent_level + 1); | |
} | |
} | |
// --- Main Function (Example Usage) --- | |
int main() { | |
const char* markdown_example = | |
"# Welcome to Markdown\n\n" | |
"This is a paragraph with *italic* and **bold** text.\n" | |
"And a line with two spaces at the end for a hard break. \n" | |
"Next line.\n\n" | |
"Another paragraph with `inline code` and a [link](http://example.com \"Optional Title\").\n" | |
"An image: \n\n" | |
"> This is a blockquote.\n" | |
"> With multiple lines.\n\n" | |
"And a lazy continuation\nfor the blockquote.\n\n" | |
"## Sub Heading\n\n" | |
"Setext L1\n" | |
"=========\n\n" | |
"Setext L2\n" | |
"---------\n\n" | |
"Indented code block:\n\n" | |
" int main() {\n" | |
" printf(\"Hello\");\n" | |
" }\n\n" | |
"Fenced code block:\n" | |
"```c\n" | |
"void func() {\n" | |
" // comment\n" | |
"}\n" | |
"```\n\n" | |
"Unordered List (tight):\n" | |
"- Item 1\n" | |
" - Nested Item 1.1\n" | |
" - Nested Item 1.2\n" | |
"- Item 2\n" | |
" * With more indent\n\n" | |
"Ordered List (tight):\n" | |
"1. First\n" | |
"2. Second\n" | |
" 1) Nested first (marker change)\n" | |
" 2) Nested second\n" | |
"3. Third\n\n" | |
"* Loose list item 1\n\n" | |
"* Loose list item 2\n" | |
" This is content for item 2.\n\n" | |
" Still item 2, after blank line.\n\n" | |
"* Loose list item 3\n\n" | |
"Horizontal Rule:\n" | |
"---\n\n" | |
"Escaped chars: \\*hello\\* \\`code\\` \\[link\\]\n" | |
"Inline HTML: <custom-tag attr=\"val\">content</custom-tag> also <br/> this is text.\n" | |
"Autolink: <http://google.com>\n" | |
"Not an autolink: <[email protected]> (mailto: is required)\n" | |
"Autolink mail: <mailto:[email protected]>\n"; | |
printf("Parsing Markdown:\n%s\n", markdown_example); | |
MDNode* doc = parse_markdown(markdown_example); | |
if (doc) { | |
printf("\nAST Structure:\n"); | |
print_ast_node(doc, 0); | |
md_node_free(doc); | |
} else { | |
printf("Failed to parse document.\n"); | |
} | |
return 0; | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// This is a translation of md.c | |
const std = @import("std"); | |
const Allocator = std.mem.Allocator; | |
const ArrayList = std.ArrayList; | |
const print = std.debug.print; | |
// Carried over from the C version's INITIAL_CHILDREN_CAPACITY; not referenced
// in the part of this file reviewed here.
const INITIAL_CHILDREN_CAPACITY = 4; | |
// Spacing of tab stops in columns: expandTabs pads every tab with spaces up
// to the next multiple of this value.
const TAB_STOP_WIDTH = 4; | |
// --- Enums and Structs --- | |
/// Kind of a node in the Markdown syntax tree.
const MdNodeType = enum {
    // Block elements
    Document,
    Paragraph,
    Heading,
    Blockquote,
    UnorderedList,
    OrderedList,
    ListItem,
    /// Indented code block.
    CodeBlock,
    FencedCodeBlock,
    HorizontalRule,
    /// Basic support.
    HtmlBlock,

    // Inline elements
    Text,
    Emphasis,
    Strong,
    Strikethrough,
    InlineCode,
    Link,
    Image,
    SoftBreak,
    HardBreak,
    /// Basic support.
    HtmlInline,
    EscapedChar,
};
/// One node of the Markdown syntax tree.
///
/// Nodes are heap-allocated by `create` and released, together with their
/// entire subtree, by `deinit`. Every optional string field is freed with the
/// node's own allocator in `deinit`, so anything stored there must have been
/// allocated with that allocator.
const MdNode = struct {
    allocator: Allocator,
    node_type: MdNodeType,
    /// For TEXT, INLINE_CODE, HTML_INLINE/BLOCK, ESCAPED_CHAR.
    text_content: ?[]const u8 = null,
    /// For FENCED_CODE_BLOCK.
    code_language: ?[]const u8 = null,
    /// For CODE_BLOCK, FENCED_CODE_BLOCK.
    code_content: ?[]const u8 = null,
    /// For HEADING (1-6).
    heading_level: u8 = 0,
    /// For LINK, IMAGE.
    url: ?[]const u8 = null,
    /// For LINK, IMAGE (optional).
    title: ?[]const u8 = null,
    /// For IMAGE.
    alt_text: ?[]const u8 = null,
    /// For LIST_ITEM ('*', '-', '+', '.', ')').
    list_item_marker: u8 = 0,
    /// For ORDERED_LIST.
    list_start_number: usize = 0,
    /// For lists.
    tight_list: bool = false,
    /// Set by `addChild` on the child being attached; null until then.
    parent: ?*MdNode = null,
    children: ArrayList(*MdNode),

    /// Allocates a node of the given type with no children and every optional
    /// field at its default. Fails only if the allocation fails.
    pub fn create(allocator: Allocator, node_type: MdNodeType) !*MdNode {
        const self = try allocator.create(MdNode);
        self.* = .{
            .allocator = allocator,
            .node_type = node_type,
            .children = ArrayList(*MdNode).init(allocator),
        };
        return self;
    }

    /// Frees the node's strings, every descendant, the child list and finally
    /// the node itself. The pointer must not be used afterwards.
    pub fn deinit(self: *MdNode) void {
        const gpa = self.allocator;

        const owned_strings = [_]?[]const u8{
            self.text_content,
            self.code_language,
            self.code_content,
            self.url,
            self.title,
            self.alt_text,
        };
        for (owned_strings) |maybe_bytes| {
            if (maybe_bytes) |bytes| gpa.free(bytes);
        }

        for (self.children.items) |descendant| {
            descendant.deinit();
        }
        self.children.deinit();
        gpa.destroy(self);
    }

    /// Appends `child` to this node's children and points the child's
    /// `parent` back at this node. The child is left untouched if the list
    /// cannot grow.
    pub fn addChild(self: *MdNode, child: *MdNode) !void {
        const slot = try self.children.addOne();
        slot.* = child;
        child.parent = self;
    }
};
// --- Utility Functions --- | |
/// Returns a newly allocated copy of `s`. The caller owns the result and
/// frees it with the same allocator.
fn dupeSlice(allocator: Allocator, s: []const u8) ![]const u8 {
    const copy = try allocator.dupe(u8, s);
    return copy;
}
/// Returns `str` without its leading spaces, tabs, carriage returns and
/// newlines. The result is a sub-slice of `str`; nothing is copied.
fn trimLeadingWhitespace(str: []const u8) []const u8 {
    var start: usize = 0;
    while (start < str.len and std.mem.indexOfScalar(u8, " \t\r\n", str[start]) != null) {
        start += 1;
    }
    return str[start..];
}
/// Returns `str` without its trailing spaces, tabs, carriage returns and
/// newlines. The result is a sub-slice of `str`; nothing is copied.
fn trimTrailingWhitespace(str: []const u8) []const u8 {
    var end: usize = str.len;
    while (end > 0 and std.mem.indexOfScalar(u8, " \t\r\n", str[end - 1]) != null) {
        end -= 1;
    }
    return str[0..end];
}
/// Returns `str` with spaces, tabs, carriage returns and newlines removed
/// from both ends. The result is a sub-slice of `str`; nothing is copied.
fn trimWhitespace(str: []const u8) []const u8 {
    var start: usize = 0;
    while (start < str.len and std.mem.indexOfScalar(u8, " \t\r\n", str[start]) != null) {
        start += 1;
    }
    var end: usize = str.len;
    while (end > start and std.mem.indexOfScalar(u8, " \t\r\n", str[end - 1]) != null) {
        end -= 1;
    }
    return str[start..end];
}
/// Counts the ' ' characters at the start of `line`. Counting stops at the
/// first byte that is not a space, so tabs are not counted.
fn countLeadingSpaces(line: []const u8) usize {
    var spaces: usize = 0;
    while (spaces < line.len and line[spaces] == ' ') {
        spaces += 1;
    }
    return spaces;
}
/// Reports whether `line` is empty or consists solely of bytes that
/// `std.ascii.isSpace` classifies as whitespace.
fn isBlankLine(line: []const u8) bool {
    var idx: usize = 0;
    while (idx < line.len) : (idx += 1) {
        if (!std.ascii.isSpace(line[idx])) return false;
    }
    return true;
}
/// Returns a newly allocated copy of `line` in which every tab is replaced by
/// the number of spaces needed to reach the next multiple of TAB_STOP_WIDTH.
/// The column counter advances by one for every other byte. The caller owns
/// the returned slice; on allocation failure nothing is leaked.
fn expandTabs(allocator: Allocator, line: []const u8) ![]u8 {
    var expanded = ArrayList(u8).init(allocator);
    errdefer expanded.deinit();

    var column: usize = 0;
    for (line) |byte| {
        if (byte != '\t') {
            try expanded.append(byte);
            column += 1;
            continue;
        }
        // Pad up to the next tab stop (a full stop's width when already on one).
        const padding = TAB_STOP_WIDTH - (column % TAB_STOP_WIDTH);
        try expanded.appendNTimes(' ', padding);
        column += padding;
    }
    return expanded.toOwnedSlice();
}
// --- Line Buffer for Block Parsing --- | |
/// Growable list of lines used while collecting block content.
///
/// The buffer stores its own copy of every line added through `addLine` and
/// frees those copies in `deinit`.
const LineBuffer = struct {
    allocator: Allocator,
    /// Owns the slices.
    lines: ArrayList([]const u8),

    /// Creates an empty buffer that allocates from `allocator`.
    pub fn init(allocator: Allocator) LineBuffer {
        return LineBuffer{
            .allocator = allocator,
            .lines = ArrayList([]const u8).init(allocator),
        };
    }

    /// Frees every stored line and the list itself.
    pub fn deinit(self: *LineBuffer) void {
        for (self.lines.items) |line| {
            self.allocator.free(line);
        }
        self.lines.deinit();
    }

    /// Appends a copy of `line`. If the list cannot grow, the copy is freed
    /// again before the error is returned, so a failed call leaks nothing.
    pub fn addLine(self: *LineBuffer, line: []const u8) !void {
        const copy = try self.allocator.dupe(u8, line);
        errdefer self.allocator.free(copy);
        try self.lines.append(copy);
    }

    /// Returns the stored lines in insertion order. The slice is invalidated
    /// by the next `addLine` or `deinit`.
    pub fn getLines(self: *const LineBuffer) []const []const u8 {
        return self.lines.items;
    }
};
// --- Forward Declarations for Parsers --- | |
// Using a struct with function pointers to break cyclic dependencies | |
/// Table of parser entry points, passed around so the inline and block
/// parsers can call each other without referring to one another directly.
///
/// NOTE(review): on Zig 0.10 and later a bare `fn (...)` field type is a
/// comptime-only function body type; if this struct has to be passed as a
/// runtime value these fields probably need to be `*const fn (...)`. Confirm
/// against the toolchain this file is built with.
const ParserFuncs = struct {
    /// Recursive inline parser. Takes the function table as its second
    /// argument, which is how every call site in this file invokes it.
    parseInlinesRecursiveFn: fn (allocator: Allocator, funcs: ParserFuncs, parent_node: *MdNode, text_start: []const u8) anyerror!void,
    parseInlinesFn: fn (allocator: Allocator, parent_node: *MdNode, text: []const u8) anyerror!void,
    parseBlocksFn: fn (allocator: Allocator, funcs: ParserFuncs, lines: []const []const u8, current_line_index: *usize, parent_container: *MdNode) anyerror!*MdNode,
    parseHorizontalRuleFn: fn (line_trimmed: []const u8) ?*MdNode,
    isListItemLineFn: fn (line: []const u8) ?ListItemInfo,
};
/// Result describing a line recognised as a list item; this is the payload of
/// the optional returned through `ParserFuncs.isListItemLineFn`.
///
/// NOTE(review): the field meanings below follow the field names and the
/// out-parameters of the C `is_list_item_line`; confirm against the Zig
/// implementation, which is not part of this section.
const ListItemInfo = struct {
    /// Presumably the indentation of the marker.
    indent: usize,
    /// Presumably the part of the line where the item's content begins.
    content_start: []const u8,
    /// Presumably the item number for ordered lists.
    number_if_ordered: usize,
    actual_marker_char: u8,
    /// '*', '-', '+', '.', ')'
    marker_type: u8,
};
// --- Inline Parsing --- | |
/// Returns the index of the first `char_to_find` in `s` that is not directly
/// preceded by a backslash, or null when there is none.
///
/// A backslash only counts as an escape when the very next byte is
/// `char_to_find`; in that case both bytes are skipped.
fn findNextUnescaped(s: []const u8, char_to_find: u8) ?usize {
    var pos: usize = 0;
    while (pos < s.len) {
        const byte = s[pos];
        const escapes_target = byte == '\\' and (pos + 1) < s.len and s[pos + 1] == char_to_find;
        if (escapes_target) {
            pos += 2;
            continue;
        }
        if (byte == char_to_find) {
            return pos;
        }
        pos += 1;
    }
    return null;
}
/// Finds the closing occurrence of `marker` in `s`.
///
/// `s` is expected to begin with the opening marker, so the search starts at
/// index `marker.len`. A candidate preceded by an odd number of consecutive
/// backslashes is treated as escaped and skipped.
///
/// Returns the index in `s` where the closing marker starts, or null when no
/// unescaped occurrence exists - including when `s` is too short to hold a
/// second marker.
fn findMatchingDelimiter(s: []const u8, marker: []const u8) ?usize {
    var p_idx: usize = marker.len;
    // Written as an addition so that a short `s` cannot underflow the bound.
    while (p_idx + marker.len <= s.len) {
        if (!std.mem.startsWith(u8, s[p_idx..], marker)) {
            p_idx += 1;
            continue;
        }

        // Count the run of backslashes that ends directly before the candidate.
        var backslashes: usize = 0;
        while (backslashes < p_idx and s[p_idx - 1 - backslashes] == '\\') {
            backslashes += 1;
        }
        if (backslashes % 2 == 1) {
            // Odd number of backslashes means the marker is escaped.
            p_idx += marker.len;
            continue;
        }
        return p_idx;
    }
    return null;
}
fn parseInlinesRecursiveImpl(allocator: Allocator, funcs: ParserFuncs, parent_node: *MdNode, text_full: []const u8) !void { | |
if (text_full.len == 0) return; | |
var p_idx: usize = 0; | |
var current_segment_start_idx: usize = 0; | |
while (p_idx < text_full.len) { | |
var inline_node_created = false; | |
var next_p_idx: usize = p_idx; | |
const p_char = text_full[p_idx]; | |
// 1. Escaped characters | |
if (p_char == '\\' and (p_idx + 1) < text_full.len and std.mem.indexOfScalar(u8, "*_`~[]()#+-.<>!", text_full[p_idx + 1]) != null) { | |
if (p_idx > current_segment_start_idx) { | |
var text_node = try MdNode.create(allocator, .Text); | |
text_node.text_content = try dupeSlice(allocator, text_full[current_segment_start_idx..p_idx]); | |
try parent_node.addChild(text_node); | |
} | |
var escaped_node = try MdNode.create(allocator, .EscapedChar); | |
escaped_node.text_content = try dupeSlice(allocator, text_full[p_idx + 1 .. p_idx + 2]); | |
try parent_node.addChild(escaped_node); | |
inline_node_created = true; | |
next_p_idx = p_idx + 2; | |
} | |
// 2. Images:  | |
else if (p_char == '!' and (p_idx + 1) < text_full.len and text_full[p_idx + 1] == '[') { | |
const alt_text_start_idx = p_idx + 2; | |
if (findNextUnescaped(text_full[alt_text_start_idx..], ']')) |alt_text_end_rel_idx| { | |
const alt_text_end_idx = alt_text_start_idx + alt_text_end_rel_idx; | |
if ((alt_text_end_idx + 1) < text_full.len and text_full[alt_text_end_idx + 1] == '(') { | |
const url_overall_start_idx = alt_text_end_idx + 2; | |
var paren_balance: i32 = 1; | |
var url_overall_end_search_idx = url_overall_start_idx; | |
while (url_overall_end_search_idx < text_full.len and paren_balance > 0) { | |
if (text_full[url_overall_end_search_idx] == '\\' and (url_overall_end_search_idx + 1) < text_full.len) { | |
url_overall_end_search_idx += 2; continue; | |
} | |
if (text_full[url_overall_end_search_idx] == '(') paren_balance += 1 | |
else if (text_full[url_overall_end_search_idx] == ')') paren_balance -= 1; | |
if (paren_balance == 0) break; | |
url_overall_end_search_idx += 1; | |
} | |
if (url_overall_end_search_idx < text_full.len and text_full[url_overall_end_search_idx] == ')') { | |
if (p_idx > current_segment_start_idx) { | |
var text_node = try MdNode.create(allocator, .Text); | |
text_node.text_content = try dupeSlice(allocator, text_full[current_segment_start_idx..p_idx]); | |
try parent_node.addChild(text_node); | |
} | |
var image_node = try MdNode.create(allocator, .Image); | |
image_node.alt_text = try dupeSlice(allocator, text_full[alt_text_start_idx..alt_text_end_idx]); | |
const link_content_full = text_full[url_overall_start_idx..url_overall_end_search_idx]; | |
var actual_url_end_rel_idx: usize = link_content_full.len; | |
var title_search_idx: usize = 0; | |
while(title_search_idx < link_content_full.len) { | |
if (std.ascii.isSpace(link_content_full[title_search_idx]) and (title_search_idx + 1) < link_content_full.len) { | |
const quote_char = link_content_full[title_search_idx+1]; | |
if (quote_char == '"' or quote_char == '\'') { | |
const t_start_rel_idx = title_search_idx + 2; | |
var t_end_rel_idx = t_start_rel_idx; | |
while(t_end_rel_idx < link_content_full.len and link_content_full[t_end_rel_idx] != quote_char) { | |
if (link_content_full[t_end_rel_idx] == '\\' and (t_end_rel_idx+1) < link_content_full.len) t_end_rel_idx +=1; | |
t_end_rel_idx +=1; | |
} | |
if (t_end_rel_idx < link_content_full.len and link_content_full[t_end_rel_idx] == quote_char) { | |
image_node.title = try dupeSlice(allocator, link_content_full[t_start_rel_idx..t_end_rel_idx]); | |
actual_url_end_rel_idx = title_search_idx; | |
break; | |
} | |
} | |
} | |
title_search_idx += 1; | |
} | |
image_node.url = try dupeSlice(allocator, trimWhitespace(link_content_full[0..actual_url_end_rel_idx])); | |
try parent_node.addChild(image_node); | |
inline_node_created = true; | |
next_p_idx = url_overall_end_search_idx + 1; | |
} | |
} | |
} | |
} | |
// 3. Links: [text](url "title") | |
else if (p_char == '[') { | |
const text_s_idx = p_idx + 1; | |
if (findNextUnescaped(text_full[text_s_idx..], ']')) |text_e_rel_idx| { | |
const text_e_idx = text_s_idx + text_e_rel_idx; | |
if ((text_e_idx + 1) < text_full.len and text_full[text_e_idx + 1] == '(') { | |
const url_s_idx = text_e_idx + 2; | |
var bal: i32 = 1; | |
var url_e_search_idx = url_s_idx; | |
while (url_e_search_idx < text_full.len and bal > 0) { | |
if (text_full[url_e_search_idx] == '\\' and (url_e_search_idx + 1) < text_full.len) { | |
url_e_search_idx += 2; continue; | |
} | |
if (text_full[url_e_search_idx] == '(') bal += 1 | |
else if (text_full[url_e_search_idx] == ')') bal -= 1; | |
if (bal == 0) break; | |
url_e_search_idx += 1; | |
} | |
if (url_e_search_idx < text_full.len and text_full[url_e_search_idx] == ')') { | |
if (p_idx > current_segment_start_idx) { | |
var text_node = try MdNode.create(allocator, .Text); | |
text_node.text_content = try dupeSlice(allocator, text_full[current_segment_start_idx..p_idx]); | |
try parent_node.addChild(text_node); | |
} | |
var link_node = try MdNode.create(allocator, .Link); | |
const link_content_full = text_full[url_s_idx..url_e_search_idx]; | |
var actual_url_end_rel_idx: usize = link_content_full.len; | |
var title_search_idx: usize = 0; | |
while(title_search_idx < link_content_full.len) { | |
if (std.ascii.isSpace(link_content_full[title_search_idx]) and (title_search_idx + 1) < link_content_full.len) { | |
const quote_char = link_content_full[title_search_idx+1]; | |
if (quote_char == '"' or quote_char == '\'') { | |
const t_start_rel_idx = title_search_idx + 2; | |
var t_end_rel_idx = t_start_rel_idx; | |
while(t_end_rel_idx < link_content_full.len and link_content_full[t_end_rel_idx] != quote_char) { | |
if (link_content_full[t_end_rel_idx] == '\\' and (t_end_rel_idx+1) < link_content_full.len) t_end_rel_idx +=1; | |
t_end_rel_idx +=1; | |
} | |
if (t_end_rel_idx < link_content_full.len and link_content_full[t_end_rel_idx] == quote_char) { | |
link_node.title = try dupeSlice(allocator, link_content_full[t_start_rel_idx..t_end_rel_idx]); | |
actual_url_end_rel_idx = title_search_idx; | |
break; | |
} | |
} | |
} | |
title_search_idx += 1; | |
} | |
link_node.url = try dupeSlice(allocator, trimWhitespace(link_content_full[0..actual_url_end_rel_idx])); | |
try parent_node.addChild(link_node); | |
try funcs.parseInlinesRecursiveFn(allocator, funcs, link_node, text_full[text_s_idx..text_e_idx]); | |
inline_node_created = true; | |
next_p_idx = url_e_search_idx + 1; | |
} | |
} | |
} | |
} | |
// Strong (**, __) | |
else if ((p_char == '*' and (p_idx + 1) < text_full.len and text_full[p_idx + 1] == '*') or | |
(p_char == '_' and (p_idx + 1) < text_full.len and text_full[p_idx + 1] == '_')) { | |
const marker = text_full[p_idx .. p_idx + 2]; | |
if (findMatchingDelimiter(text_full[p_idx..], marker)) |end_marker_rel_idx| { | |
const end_marker_idx = p_idx + end_marker_rel_idx; | |
if (p_idx > current_segment_start_idx) { | |
var text_node = try MdNode.create(allocator, .Text); | |
text_node.text_content = try dupeSlice(allocator, text_full[current_segment_start_idx..p_idx]); | |
try parent_node.addChild(text_node); | |
} | |
const strong_node = try MdNode.create(allocator, .Strong); | |
try parent_node.addChild(strong_node); | |
try funcs.parseInlinesRecursiveFn(allocator, funcs, strong_node, text_full[p_idx + 2 .. end_marker_idx]); | |
inline_node_created = true; | |
next_p_idx = end_marker_idx + 2; | |
} | |
} | |
// Emphasis (*, _) | |
else if (p_char == '*' or p_char == '_') { | |
const marker = text_full[p_idx .. p_idx + 1]; | |
// Need to check that this is not part of a strong marker. | |
// E.g. in "**foo**", the first '*' should not match as emphasis. | |
// This is complex with CommonMark rules (left/right flanking, etc.) | |
// Simplified: if next char is same, it's probably strong, skip. | |
var is_part_of_strong = false; | |
if ((p_idx + 1) < text_full.len and text_full[p_idx+1] == p_char) { | |
is_part_of_strong = true; | |
} | |
if (!is_part_of_strong and findMatchingDelimiter(text_full[p_idx..], marker)) |end_marker_rel_idx| { | |
const end_marker_idx = p_idx + end_marker_rel_idx; | |
if (end_marker_idx > p_idx + 1) { // Not empty | |
if (p_idx > current_segment_start_idx) { | |
var text_node = try MdNode.create(allocator, .Text); | |
text_node.text_content = try dupeSlice(allocator, text_full[current_segment_start_idx..p_idx]); | |
try parent_node.addChild(text_node); | |
} | |
const emp_node = try MdNode.create(allocator, .Emphasis); | |
try parent_node.addChild(emp_node); | |
try funcs.parseInlinesRecursiveFn(allocator, funcs, emp_node, text_full[p_idx + 1 .. end_marker_idx]); | |
inline_node_created = true; | |
next_p_idx = end_marker_idx + 1; | |
} | |
} | |
} | |
// Strikethrough (~~) | |
else if (p_char == '~' and (p_idx + 1) < text_full.len and text_full[p_idx + 1] == '~') { | |
if (findMatchingDelimiter(text_full[p_idx..], "~~")) |end_marker_rel_idx| { | |
const end_marker_idx = p_idx + end_marker_rel_idx; | |
if (p_idx > current_segment_start_idx) { | |
var text_node = try MdNode.create(allocator, .Text); | |
text_node.text_content = try dupeSlice(allocator, text_full[current_segment_start_idx..p_idx]); | |
try parent_node.addChild(text_node); | |
} | |
const strike_node = try MdNode.create(allocator, .Strikethrough); | |
try parent_node.addChild(strike_node); | |
try funcs.parseInlinesRecursiveFn(allocator, funcs, strike_node, text_full[p_idx + 2 .. end_marker_idx]); | |
inline_node_created = true; | |
next_p_idx = end_marker_idx + 2; | |
} | |
} | |
// Inline Code (`) | |
else if (p_char == '`') { | |
// Basic: find next '`'. GFM allows multiple backticks. | |
var opening_backticks: usize = 1; | |
var ob_idx = p_idx + 1; | |
while(ob_idx < text_full.len and text_full[ob_idx] == '`') : (ob_idx+=1) { | |
opening_backticks +=1; | |
} | |
var cb_search_idx = ob_idx; | |
var end_marker_idx: ?usize = null; | |
while(cb_search_idx < text_full.len) { | |
if (text_full[cb_search_idx] == '`') { | |
var closing_backticks: usize = 1; | |
var cbt_idx = cb_search_idx + 1; | |
while(cbt_idx < text_full.len and text_full[cbt_idx] == '`') : (cbt_idx +=1) { | |
closing_backticks +=1; | |
} | |
if (closing_backticks == opening_backticks) { | |
end_marker_idx = cb_search_idx; | |
break; | |
} | |
cb_search_idx = cbt_idx; // Jump past these backticks | |
} else { | |
cb_search_idx +=1; | |
} | |
} | |
if (end_marker_idx) |em_idx| { | |
if (p_idx > current_segment_start_idx) { | |
var text_node = try MdNode.create(allocator, .Text); | |
text_node.text_content = try dupeSlice(allocator, text_full[current_segment_start_idx..p_idx]); | |
try parent_node.addChild(text_node); | |
} | |
var code_node = try MdNode.create(allocator, .InlineCode); | |
var code_slice = text_full[p_idx + opening_backticks .. em_idx]; | |
// Trim one leading/trailing space if content is not all spaces and starts/ends with space | |
// and if the code span is not empty after potential trimming | |
if (code_slice.len >= 2 and code_slice[0] == ' ' and code_slice[code_slice.len-1] == ' ') { | |
var all_spaces = true; | |
for (code_slice) |c| { | |
if (c != ' ') { | |
all_spaces = false; | |
break; | |
} | |
} | |
if (!all_spaces) { | |
code_slice = code_slice[1..code_slice.len-1]; | |
} | |
} else if (code_slice.len == 1 and code_slice[0] == ' '){ | |
// ` ` should be ` ` not `` | |
} | |
code_node.text_content = try dupeSlice(allocator, code_slice); | |
try parent_node.addChild(code_node); | |
inline_node_created = true; | |
next_p_idx = em_idx + opening_backticks; | |
} | |
} | |
// Autolinks <http://...> etc. and basic HTML tags | |
else if (p_char == '<') { | |
const remaining_text = text_full[p_idx + 1 ..]; | |
var is_autolink_uri = false; | |
if (std.mem.startsWith(u8, remaining_text, "http://") or | |
std.mem.startsWith(u8, remaining_text, "https://") or | |
std.mem.startsWith(u8, remaining_text, "mailto:") or | |
std.mem.startsWith(u8, remaining_text, "ftp://")) { | |
is_autolink_uri = true; | |
} | |
if (is_autolink_uri) { | |
if (findNextUnescaped(remaining_text, '>')) |end_autolink_rel_idx| { | |
const end_autolink_idx = (p_idx + 1) + end_autolink_rel_idx; | |
if (p_idx > current_segment_start_idx) { | |
var text_node = try MdNode.create(allocator, .Text); | |
text_node.text_content = try dupeSlice(allocator, text_full[current_segment_start_idx..p_idx]); | |
try parent_node.addChild(text_node); | |
} | |
var link_node = try MdNode.create(allocator, .Link); | |
link_node.url = try dupeSlice(allocator, text_full[p_idx + 1 .. end_autolink_idx]); | |
var text_child = try MdNode.create(allocator, .Text); // Autolink text is its URL | |
text_child.text_content = try dupeSlice(allocator, link_node.url.?); | |
try link_node.addChild(text_child); | |
try parent_node.addChild(link_node); | |
inline_node_created = true; | |
next_p_idx = end_autolink_idx + 1; | |
} | |
} | |
else if ((p_idx + 1) < text_full.len and (std.ascii.isAlpha(text_full[p_idx + 1]) or text_full[p_idx+1] == '/' or text_full[p_idx+1] == '!')) { // Basic HTML tag | |
if (std.mem.indexOfScalar(u8, text_full[p_idx+1..], '>')) |end_tag_rel_idx| { | |
const end_tag_idx = (p_idx+1) + end_tag_rel_idx; | |
if (p_idx > current_segment_start_idx) { | |
var text_node = try MdNode.create(allocator, .Text); | |
text_node.text_content = try dupeSlice(allocator, text_full[current_segment_start_idx..p_idx]); | |
try parent_node.addChild(text_node); | |
} | |
var html_node = try MdNode.create(allocator, .HtmlInline); | |
html_node.text_content = try dupeSlice(allocator, text_full[p_idx .. end_tag_idx + 1]); | |
try parent_node.addChild(html_node); | |
inline_node_created = true; | |
next_p_idx = end_tag_idx + 1; | |
} | |
} | |
} | |
if (inline_node_created) { | |
p_idx = next_p_idx; | |
current_segment_start_idx = p_idx; | |
} else { | |
p_idx += 1; | |
} | |
} | |
if (p_idx > current_segment_start_idx) { | |
var text_node = try MdNode.create(allocator, .Text); | |
text_node.text_content = try dupeSlice(allocator, text_full[current_segment_start_idx..p_idx]); | |
try parent_node.addChild(text_node); | |
} | |
} | |
/// Block-level entry point for inline parsing.
///
/// Empty `text` is a no-op. Anything else is forwarded unchanged to
/// `funcs.parseInlinesRecursiveFn` together with `parent_block_node`;
/// errors from that call propagate to the caller.
fn parseInlinesImpl(allocator: Allocator, funcs: ParserFuncs, parent_block_node: *MdNode, text: []const u8) !void {
    if (text.len > 0) {
        try funcs.parseInlinesRecursiveFn(allocator, funcs, parent_block_node, text);
    }
}
// --- Block Parsing ---
/// Returns the index where a list item's content begins, given the index just
/// past its marker: skips at most four spaces, then at most one tab.
fn skipListMarkerPadding(line: []const u8, after_marker_idx: usize) usize {
    var idx = after_marker_idx;
    while (idx < line.len and line[idx] == ' ' and idx < after_marker_idx + 4) idx += 1;
    if (idx < line.len and line[idx] == '\t') idx += 1;
    return idx;
}

/// Classifies `line` as a list item line.
///
/// Recognised markers, after any number of leading spaces:
///   * bullets `*`, `-`, `+`
///   * ordered markers: 1-9 decimal digits followed by `.` or `)`
/// The marker must be followed by whitespace or the end of the line.
///
/// Returns null when `line` is not a list item. Otherwise the result holds the
/// number of leading spaces (`indent`), the remainder of the line after the
/// marker padding (`content_start`, a sub-slice of `line`), the parsed number
/// (0 for bullets) and the marker/delimiter character.
fn isListItemLineImpl(line: []const u8) ?ListItemInfo {
    // CommonMark caps ordered list numbers at nine digits; this also guarantees
    // the number fits in `usize` instead of silently degrading to 0 on overflow.
    const max_ordered_digits = 9;

    var indent: usize = 0;
    while (indent < line.len and line[indent] == ' ') indent += 1;
    if (indent >= line.len) return null;

    const marker_char = line[indent];

    if (marker_char == '*' or marker_char == '-' or marker_char == '+') {
        const after_marker = indent + 1;
        if (after_marker < line.len and !std.ascii.isWhitespace(line[after_marker])) return null;
        return ListItemInfo{
            .indent = indent,
            .content_start = line[skipListMarkerPadding(line, after_marker)..],
            .number_if_ordered = 0,
            .actual_marker_char = marker_char,
            .marker_type = marker_char,
        };
    }

    if (!std.ascii.isDigit(marker_char)) return null;

    var num_end_idx = indent;
    while (num_end_idx < line.len and std.ascii.isDigit(line[num_end_idx])) num_end_idx += 1;
    if (num_end_idx >= line.len) return null;
    if (num_end_idx - indent > max_ordered_digits) return null;

    const delimiter = line[num_end_idx];
    if (delimiter != '.' and delimiter != ')') return null;

    const after_marker = num_end_idx + 1;
    if (after_marker < line.len and !std.ascii.isWhitespace(line[after_marker])) return null;

    const num_val = std.fmt.parseInt(usize, line[indent..num_end_idx], 10) catch return null;
    return ListItemInfo{
        .indent = indent,
        .content_start = line[skipListMarkerPadding(line, after_marker)..],
        .number_if_ordered = num_val,
        .actual_marker_char = delimiter,
        .marker_type = delimiter,
    };
}
/// Parses an ATX heading (`# Title`, `## Title ##`, ...) from `line_trimmed`,
/// which is expected to have its leading whitespace already removed.
///
/// Returns null when the line is not a heading: no leading `#`, more than six
/// of them, or a non-whitespace character directly after the opening run.
/// Otherwise returns a new `.Heading` node whose `heading_level` is the number
/// of opening hashes and whose children are the parsed inline content.
///
/// An optional closing sequence (a trailing run of `#`) is removed only when
/// it is preceded by a space/tab or makes up the whole content, so `# foo#`
/// and `# foo \#` keep their hashes.
fn parseHeadingAtx(allocator: Allocator, funcs: ParserFuncs, line_trimmed: []const u8) !?*MdNode {
    var level: u8 = 0;
    var p_idx: usize = 0;
    while (p_idx < line_trimmed.len and line_trimmed[p_idx] == '#') : (p_idx += 1) {
        // Bail out on the seventh '#' so `level` can never overflow on long runs.
        if (level == 6) return null;
        level += 1;
    }
    if (level == 0) return null;

    if (p_idx < line_trimmed.len and !std.ascii.isWhitespace(line_trimmed[p_idx])) return null;
    while (p_idx < line_trimmed.len and std.ascii.isWhitespace(line_trimmed[p_idx])) p_idx += 1;

    var content = line_trimmed[p_idx..];

    var end_idx = content.len;
    while (end_idx > 0 and std.ascii.isWhitespace(content[end_idx - 1])) end_idx -= 1;

    // Treat the trailing run of '#' as one unit: it is either a closing
    // sequence (and removed entirely) or literal text (and kept entirely).
    var run_start = end_idx;
    while (run_start > 0 and content[run_start - 1] == '#') run_start -= 1;
    if (run_start < end_idx) {
        const is_closing_sequence = run_start == 0 or
            content[run_start - 1] == ' ' or content[run_start - 1] == '\t';
        if (is_closing_sequence) {
            end_idx = run_start;
            while (end_idx > 0 and std.ascii.isWhitespace(content[end_idx - 1])) end_idx -= 1;
        }
    }
    content = trimTrailingWhitespace(content[0..end_idx]);

    const heading_node = try MdNode.create(allocator, .Heading);
    errdefer heading_node.deinit(); // do not leak the node if inline parsing fails
    heading_node.heading_level = level;
    try funcs.parseInlinesFn(allocator, funcs, heading_node, content);
    return heading_node;
}
/// Parses a fenced code block whose opening fence is `lines[current_line_index.*]`.
///
/// The opening fence is a run of at least three '`' or '~' after optional
/// leading spaces; a backtick fence whose info string contains a backtick is
/// rejected. Returns null (without consuming any line) when the first line is
/// not a valid opening fence.
///
/// On success `current_line_index` is advanced past the closing fence, or to
/// the end of `lines` if the block is never closed, and a `.FencedCodeBlock`
/// node is returned with:
///   * `code_language`: first word of the info string, if any
///   * `code_content`: body lines joined with '\n'; up to the opening fence's
///     indentation is removed from each line, and one trailing newline (i.e. a
///     final empty body line) is dropped. Never null.
fn parseFencedCodeBlock(allocator: Allocator, funcs: ParserFuncs, lines: []const []const u8, current_line_index: *usize) !?*MdNode {
    _ = funcs; // Not used here

    const first_line = lines[current_line_index.*];
    const line_indent = countLeadingSpaces(first_line);
    const opener = first_line[line_indent..];
    if (opener.len < 3) return null;

    const fence_char = opener[0];
    if (fence_char != '`' and fence_char != '~') return null;

    var fence_len: usize = 0;
    while (fence_len < opener.len and opener[fence_len] == fence_char) fence_len += 1;
    if (fence_len < 3) return null;

    const info_string = trimLeadingWhitespace(opener[fence_len..]);
    if (fence_char == '`' and std.mem.indexOfScalar(u8, info_string, '`') != null) {
        return null;
    }

    const code_node = try MdNode.create(allocator, .FencedCodeBlock);
    errdefer code_node.deinit(); // release the node if any later allocation fails

    if (info_string.len > 0) {
        const info_trimmed = trimTrailingWhitespace(info_string);
        const first_word_end = std.mem.indexOfScalar(u8, info_trimmed, ' ') orelse info_trimmed.len;
        code_node.code_language = try dupeSlice(allocator, info_trimmed[0..first_word_end]);
    }
    current_line_index.* += 1;

    var content_buffer = LineBuffer.init(allocator);
    defer content_buffer.deinit();

    // A closing fence may be indented by up to three spaces regardless of the
    // opener's indentation (CommonMark); keep accepting the opener's own
    // indentation as well so previously recognised fences still close.
    const max_closing_indent = @max(line_indent, 3);

    while (current_line_index.* < lines.len) {
        const line = lines[current_line_index.*];

        var fence_start: usize = 0;
        while (fence_start < max_closing_indent and fence_start < line.len and line[fence_start] == ' ') {
            fence_start += 1;
        }
        const candidate = line[fence_start..];
        var closing_fence_len: usize = 0;
        while (closing_fence_len < candidate.len and candidate[closing_fence_len] == fence_char) {
            closing_fence_len += 1;
        }
        if (closing_fence_len >= fence_len and isBlankLine(trimLeadingWhitespace(candidate[closing_fence_len..]))) {
            current_line_index.* += 1;
            break;
        }

        const leading_spaces = countLeadingSpaces(line);
        try content_buffer.addLine(line[@min(leading_spaces, line_indent)..]);
        current_line_index.* += 1;
    }

    var joined_content = ArrayList(u8).init(allocator);
    defer joined_content.deinit();
    for (content_buffer.getLines(), 0..) |body_line, i| {
        if (i > 0) try joined_content.append('\n');
        try joined_content.appendSlice(body_line);
    }

    // Strip the final newline of the block content, if one exists.
    var body: []const u8 = joined_content.items;
    if (body.len > 0 and body[body.len - 1] == '\n') body = body[0 .. body.len - 1];

    code_node.code_content = try dupeSlice(allocator, body);
    return code_node;
}
/// Parses an indented code block starting at `lines[current_line_index.*]`.
///
/// Consumes consecutive lines that are indented by at least `TAB_STOP_WIDTH`
/// spaces, plus blank lines that follow at least one code line. Trailing blank
/// lines are consumed but not included in the content.
///
/// Returns a `.CodeBlock` node whose `code_content` is the collected lines
/// (with `TAB_STOP_WIDTH` columns of indentation removed) joined with '\n'.
/// Returns null, consuming nothing, when the first line does not qualify, so
/// the caller can fall back to another block type instead of receiving an
/// empty block for a line that was never consumed.
fn parseCodeBlockIndented(allocator: Allocator, funcs: ParserFuncs, lines: []const []const u8, current_line_index: *usize) !?*MdNode {
    _ = funcs;

    var content_buffer = LineBuffer.init(allocator);
    defer content_buffer.deinit();

    const start_index = current_line_index.*;
    while (current_line_index.* < lines.len) {
        const line = lines[current_line_index.*];
        const indent = countLeadingSpaces(line);
        if (isBlankLine(line)) {
            // A blank line can continue a block but never start one.
            if (content_buffer.getLines().len == 0) break;
            try content_buffer.addLine(line[@min(indent, TAB_STOP_WIDTH)..]);
        } else {
            if (indent < TAB_STOP_WIDTH) break;
            try content_buffer.addLine(line[TAB_STOP_WIDTH..]);
        }
        current_line_index.* += 1;
    }
    if (current_line_index.* == start_index) return null;

    // Drop trailing blank lines; the buffer owns its lines, so free each one.
    while (content_buffer.getLines().len > 0 and isBlankLine(content_buffer.getLines()[content_buffer.getLines().len - 1])) {
        const last_idx = content_buffer.lines.items.len - 1;
        allocator.free(content_buffer.lines.items[last_idx]);
        content_buffer.lines.shrinkRetainingCapacity(last_idx);
    }

    // The node is created only after line collection so a failed `addLine`
    // cannot leak it.
    const code_node = try MdNode.create(allocator, .CodeBlock);
    errdefer code_node.deinit();

    var joined_code = ArrayList(u8).init(allocator);
    defer joined_code.deinit();
    for (content_buffer.getLines(), 0..) |ln, i| {
        if (i > 0) try joined_code.append('\n');
        try joined_code.appendSlice(ln);
    }
    code_node.code_content = try joined_code.toOwnedSlice();
    return code_node;
}
/// Parses a blockquote starting at `lines[current_line_index.*]`.
///
/// Lines whose first non-whitespace character is `>` have the marker and one
/// optional following space/tab removed. Once at least one marker line has
/// been seen, non-blank lines without a marker are taken as lazy continuation
/// lines. The collected lines are then parsed as blocks into the new node.
///
/// Returns null, consuming nothing, when the first line carries no `>` marker.
/// Otherwise returns a `.Blockquote` node and leaves `current_line_index` on
/// the first line that is not part of the quote.
fn parseBlockquote(allocator: Allocator, funcs: ParserFuncs, lines: []const []const u8, current_line_index: *usize) !?*MdNode {
    var bq_content_lines = LineBuffer.init(allocator);
    // Deferred so the buffer is released on every path (including errors) and
    // is never inspected after it has been torn down.
    defer bq_content_lines.deinit();

    var seen_marker_line = false;
    while (current_line_index.* < lines.len) {
        const line = lines[current_line_index.*];
        const trimmed_line = trimLeadingWhitespace(line);
        if (trimmed_line.len > 0 and trimmed_line[0] == '>') {
            var content_after_marker = trimmed_line[1..];
            if (content_after_marker.len > 0 and (content_after_marker[0] == ' ' or content_after_marker[0] == '\t')) {
                content_after_marker = content_after_marker[1..];
            }
            try bq_content_lines.addLine(content_after_marker);
            seen_marker_line = true;
        } else if (seen_marker_line and !isBlankLine(line)) {
            try bq_content_lines.addLine(line); // lazy continuation
        } else {
            break;
        }
        current_line_index.* += 1;
    }

    // Nothing collected means the first line was not a blockquote line.
    if (bq_content_lines.getLines().len == 0) return null;

    const bq_node = try MdNode.create(allocator, .Blockquote);
    errdefer bq_node.deinit();

    var inner_line_index: usize = 0;
    _ = try funcs.parseBlocksFn(allocator, funcs, bq_content_lines.getLines(), &inner_line_index, bq_node);
    return bq_node;
}
/// Parses a list whose first item starts at `lines[current_line_index.*]`.
///
/// The list kind (ordered/unordered), start number and marker are taken from
/// the first item. Following lines are consumed as long as they are list items
/// of the same kind with an indent of at least `initial_list_indent`, or
/// continuation lines indented up to the current item's content column.
/// Each item's lines are parsed as blocks into a `.ListItem` child.
///
/// `tight_list` is cleared when items are separated by blank lines, when an
/// item contains an internal blank line, or when an item has more than one
/// child block.
///
/// Returns null when the first line is not a list item or no item could be
/// built. On return `current_line_index` points at the first unconsumed line;
/// blank lines that end the list are left for the caller.
fn parseList(allocator: Allocator, funcs: ParserFuncs, lines: []const []const u8, current_line_index: *usize, initial_list_indent: usize) !?*MdNode {
    const first_item_info = funcs.isListItemLineFn(lines[current_line_index.*]) orelse return null;
    const list_is_ordered = (first_item_info.marker_type == '.' or first_item_info.marker_type == ')');

    // Select the node kind per branch: an enum literal chosen by a runtime
    // condition has no concrete type on its own.
    const list_node = if (list_is_ordered)
        try MdNode.create(allocator, .OrderedList)
    else
        try MdNode.create(allocator, .UnorderedList);
    errdefer list_node.deinit();

    if (list_is_ordered) {
        list_node.list_start_number = first_item_info.number_if_ordered;
    }
    list_node.list_item_marker = first_item_info.actual_marker_char;
    list_node.tight_list = true;

    while (current_line_index.* < lines.len) {
        // Blank lines between sibling items keep the list going (and make it
        // loose); look past them to see whether another item follows.
        var item_line_idx = current_line_index.*;
        while (item_line_idx < lines.len and isBlankLine(lines[item_line_idx])) item_line_idx += 1;
        if (item_line_idx >= lines.len) break;

        const item_line = lines[item_line_idx];
        const item_info = funcs.isListItemLineFn(item_line) orelse break;
        if (item_info.indent < initial_list_indent) break;

        const item_is_ordered = (item_info.marker_type == '.' or item_info.marker_type == ')');
        if (item_is_ordered != list_is_ordered) break;

        // A thematic break such as `* * *` also looks like a bullet item; it
        // ends the list and is left for the block parser.
        if (funcs.parseHorizontalRuleFn(trimLeadingWhitespace(item_line))) |hr_probe| {
            hr_probe.deinit();
            break;
        }

        if (item_line_idx != current_line_index.*) {
            list_node.tight_list = false;
            current_line_index.* = item_line_idx;
        }

        const item_node = try MdNode.create(allocator, .ListItem);
        // `addChild` below is the last fallible step of the iteration, so this
        // only fires while the item is still unattached.
        errdefer item_node.deinit();
        item_node.list_item_marker = item_info.actual_marker_char;

        var item_content_lines_buf = LineBuffer.init(allocator);
        defer item_content_lines_buf.deinit();
        try item_content_lines_buf.addLine(item_info.content_start);
        current_line_index.* += 1;

        // `content_start` is a suffix of the item line, so the length
        // difference is the column at which the item's content begins.
        const marker_and_indent_len = item_line.len - item_info.content_start.len;

        var item_has_internal_blank = false;
        while (current_line_index.* < lines.len) {
            const next_line_str = lines[current_line_index.*];

            if (funcs.isListItemLineFn(next_line_str)) |next_info| {
                const next_is_ordered = (next_info.marker_type == '.' or next_info.marker_type == ')');
                const starts_sibling = next_info.indent >= initial_list_indent and
                    next_is_ordered == list_is_ordered and
                    (!list_is_ordered or
                        next_info.actual_marker_char == list_node.list_item_marker or
                        list_node.children.items.len == 0);
                if (starts_sibling) break;
                // An ordered marker with a different delimiter falls through
                // and is judged as ordinary content below.
            }

            if (isBlankLine(next_line_str)) {
                if (current_line_index.* + 1 >= lines.len) break;
                const after_blank_line = lines[current_line_index.* + 1];
                const continues_item = countLeadingSpaces(after_blank_line) >= marker_and_indent_len and
                    funcs.isListItemLineFn(after_blank_line) == null;
                if (!continues_item) break;
                try item_content_lines_buf.addLine("");
                item_has_internal_blank = true; // internal blank implies loose
            } else {
                if (countLeadingSpaces(next_line_str) < marker_and_indent_len) break;
                try item_content_lines_buf.addLine(next_line_str[marker_and_indent_len..]);
            }
            current_line_index.* += 1;
        }

        var item_block_index: usize = 0;
        _ = try funcs.parseBlocksFn(allocator, funcs, item_content_lines_buf.getLines(), &item_block_index, item_node);

        if (item_has_internal_blank or item_node.children.items.len > 1) {
            list_node.tight_list = false;
        }
        try list_node.addChild(item_node);
    }

    if (list_node.children.items.len == 0) {
        list_node.deinit();
        return null;
    }
    return list_node;
}
/// Tests whether `line_trimmed` (leading whitespace already removed) is a
/// horizontal rule: it starts with `*`, `-` or `_`, contains at least three of
/// that character, and nothing else except whitespace.
///
/// Returns a freshly created `.HorizontalRule` node on a match, or null when
/// the line is not a rule or the node could not be allocated. The node serves
/// as a match indicator; the caller owns it and must `deinit()` it.
fn parseHorizontalRuleImpl(line_trimmed: []const u8) ?*MdNode {
    if (line_trimmed.len == 0) return null;

    const rule_char = line_trimmed[0];
    if (rule_char != '*' and rule_char != '-' and rule_char != '_') return null;

    var count: usize = 0;
    for (line_trimmed) |c| {
        if (c == rule_char) {
            count += 1;
        } else if (!std.ascii.isWhitespace(c)) {
            return null;
        }
    }
    if (count < 3) return null;

    // This function receives no allocator, so the probe node needs a global
    // one. `std.testing.allocator` is only usable inside test builds, which
    // would make the parser unusable from a normal program; the page allocator
    // is always available.
    return MdNode.create(std.heap.page_allocator, .HorizontalRule) catch null;
}
/// Builds a `.Paragraph` node from `para_lines` and appends it to
/// `parent_container`. Each line is inline-parsed separately; a line ending in
/// two spaces is followed by a `.HardBreak`, any other non-final line by a
/// `.SoftBreak`. A paragraph that ends up without children is discarded.
fn appendParagraphFromLines(allocator: Allocator, funcs: ParserFuncs, para_lines: []const []const u8, parent_container: *MdNode) !void {
    const para_node = try MdNode.create(allocator, .Paragraph);
    errdefer para_node.deinit();

    for (para_lines, 0..) |para_line, i| {
        var line_text = para_line;
        const hard_break = line_text.len >= 2 and
            line_text[line_text.len - 1] == ' ' and
            line_text[line_text.len - 2] == ' ';
        if (hard_break) line_text = line_text[0 .. line_text.len - 2];

        try funcs.parseInlinesFn(allocator, funcs, para_node, line_text);

        if (hard_break) {
            const break_node = try MdNode.create(allocator, .HardBreak);
            errdefer break_node.deinit();
            try para_node.addChild(break_node);
        } else if (i + 1 < para_lines.len) {
            const break_node = try MdNode.create(allocator, .SoftBreak);
            errdefer break_node.deinit();
            try para_node.addChild(break_node);
        }
    }

    if (para_node.children.items.len == 0) {
        para_node.deinit();
        return;
    }
    try parent_container.addChild(para_node);
}

/// Parses `lines`, starting at `current_line_index.*`, into block nodes that
/// are appended to `parent_container`; returns `parent_container`.
///
/// Per line, in order:
///   1. With an open paragraph, a line made only of `=` or `-` turns that
///      paragraph into a setext heading (level 1 or 2). This takes precedence
///      over a thematic break, as in CommonMark.
///   2. An open paragraph is flushed when the line is blank or starts another
///      block (`#`, code fence, horizontal rule, `>`, list item).
///   3. Blank lines are skipped.
///   4. Block parsers are tried: ATX heading, fenced code, horizontal rule,
///      blockquote, list, then indented code (only with no open paragraph).
///   5. Anything else is buffered as a paragraph line, trailing whitespace
///      intact so hard breaks can be detected at flush time.
/// `current_line_index` is advanced to `lines.len`.
fn parseBlocksImpl(allocator: Allocator, funcs: ParserFuncs, lines: []const []const u8, current_line_index: *usize, parent_container: *MdNode) !*MdNode {
    var paragraph_buffer = LineBuffer.init(allocator);
    defer paragraph_buffer.deinit();

    while (current_line_index.* < lines.len) {
        const line_orig = lines[current_line_index.*];
        const line_trimmed_leading = trimLeadingWhitespace(line_orig);
        // The trimmed line is a suffix of the original, so the length
        // difference is the amount of leading whitespace.
        const initial_indent_spaces = line_orig.len - line_trimmed_leading.len;
        const line_is_blank = isBlankLine(line_orig);

        // --- Setext heading underline ---
        if (paragraph_buffer.getLines().len > 0 and line_trimmed_leading.len > 0) {
            const underline_char = line_trimmed_leading[0];
            if (underline_char == '=' or underline_char == '-') {
                var is_underline = true;
                for (line_trimmed_leading) |c| {
                    if (c != underline_char) {
                        is_underline = false;
                        break;
                    }
                }
                if (is_underline) {
                    const setext_level: u8 = if (underline_char == '=') 1 else 2;
                    const heading_node = try MdNode.create(allocator, .Heading);
                    errdefer heading_node.deinit();
                    heading_node.heading_level = setext_level;

                    var full_text_list = ArrayList(u8).init(allocator);
                    defer full_text_list.deinit();
                    for (paragraph_buffer.getLines(), 0..) |para_line, i| {
                        if (i > 0) try full_text_list.append('\n');
                        try full_text_list.appendSlice(para_line);
                    }
                    try funcs.parseInlinesFn(allocator, funcs, heading_node, full_text_list.items);
                    try parent_container.addChild(heading_node);

                    for (paragraph_buffer.lines.items) |l| allocator.free(l);
                    paragraph_buffer.lines.clearRetainingCapacity();
                    current_line_index.* += 1;
                    continue;
                }
            }
        }

        // --- Does this line interrupt an open paragraph? ---
        var line_starts_new_non_para_block = false;
        if (line_trimmed_leading.len > 0) {
            const first_char = line_trimmed_leading[0];
            if (first_char == '#') {
                line_starts_new_non_para_block = true;
            } else if (std.mem.startsWith(u8, line_trimmed_leading, "```") or std.mem.startsWith(u8, line_trimmed_leading, "~~~")) {
                line_starts_new_non_para_block = true;
            } else if (funcs.parseHorizontalRuleFn(line_trimmed_leading)) |hr_probe| {
                hr_probe.deinit();
                line_starts_new_non_para_block = true;
            } else if (first_char == '>') {
                line_starts_new_non_para_block = true;
            } else if (funcs.isListItemLineFn(line_orig) != null) {
                line_starts_new_non_para_block = true;
            }
        }
        if (paragraph_buffer.getLines().len == 0 and initial_indent_spaces >= TAB_STOP_WIDTH and !line_is_blank) {
            line_starts_new_non_para_block = true;
        }

        if (paragraph_buffer.getLines().len > 0 and (line_starts_new_non_para_block or line_is_blank)) {
            try appendParagraphFromLines(allocator, funcs, paragraph_buffer.getLines(), parent_container);
            for (paragraph_buffer.lines.items) |l| allocator.free(l);
            paragraph_buffer.lines.clearRetainingCapacity();
        }

        if (line_is_blank) {
            current_line_index.* += 1;
            continue;
        }

        // --- Block dispatch ---
        var new_block_node: ?*MdNode = null;
        if (line_trimmed_leading.len > 0) {
            const first_char = line_trimmed_leading[0];
            if (first_char == '#') {
                new_block_node = try parseHeadingAtx(allocator, funcs, line_trimmed_leading);
                if (new_block_node != null) current_line_index.* += 1;
            } else if (std.mem.startsWith(u8, line_trimmed_leading, "```") or std.mem.startsWith(u8, line_trimmed_leading, "~~~")) {
                new_block_node = try parseFencedCodeBlock(allocator, funcs, lines, current_line_index);
            } else if (funcs.parseHorizontalRuleFn(line_trimmed_leading)) |hr_probe| {
                // The probe node only signals a match; release it and create
                // the real node with the caller's allocator.
                hr_probe.deinit();
                new_block_node = try MdNode.create(allocator, .HorizontalRule);
                current_line_index.* += 1;
            }
            if (new_block_node == null and first_char == '>') {
                new_block_node = try parseBlockquote(allocator, funcs, lines, current_line_index);
            }
        }
        if (new_block_node == null and funcs.isListItemLineFn(line_orig) != null) {
            new_block_node = try parseList(allocator, funcs, lines, current_line_index, initial_indent_spaces);
        }
        if (new_block_node == null and initial_indent_spaces >= TAB_STOP_WIDTH and paragraph_buffer.getLines().len == 0) {
            new_block_node = try parseCodeBlockIndented(allocator, funcs, lines, current_line_index);
        }

        if (new_block_node) |nbn| {
            errdefer nbn.deinit();
            try parent_container.addChild(nbn);
            continue;
        }

        // Don't trim trailing whitespace from paragraph lines yet (for hard breaks)
        try paragraph_buffer.addLine(line_trimmed_leading);
        current_line_index.* += 1;
    }

    if (paragraph_buffer.getLines().len > 0) {
        try appendParagraphFromLines(allocator, funcs, paragraph_buffer.getLines(), parent_container);
    }
    return parent_container;
}
/// Creates a `.Document` node and parses all of `lines_input` into it via
/// `funcs.parseBlocksFn`. The caller owns the returned node; on error the
/// partially built document is released before the error propagates.
fn parseDocumentFromLines(allocator: Allocator, funcs: ParserFuncs, lines_input: []const []const u8) !*MdNode {
    const doc = try MdNode.create(allocator, .Document);
    errdefer doc.deinit();

    // The block parser advances the index through a pointer, so it must be a
    // mutable local passed by address.
    var current_line_idx: usize = 0;
    _ = try funcs.parseBlocksFn(allocator, funcs, lines_input, &current_line_idx, doc);
    return doc;
}
/// Public entry point: parses `markdown_text` into a document tree.
///
/// The text is split on '\n'; one trailing '\r' per line is removed (CRLF
/// input) and each line is passed through `expandTabs` before being stored.
/// If `expandTabs` fails for a line, a warning is printed and the unexpanded
/// line is stored instead. The stored lines are then handed to
/// `parseDocumentFromLines`, whose result is returned.
pub fn parseMarkdown(allocator: Allocator, markdown_text: []const u8) !*MdNode {
    var line_buf_storage = LineBuffer.init(allocator);
    defer line_buf_storage.deinit();

    var line_iterator = std.mem.splitScalar(u8, markdown_text, '\n');
    while (line_iterator.next()) |raw_line| {
        // Handle CRLF
        const ends_with_cr = raw_line.len > 0 and raw_line[raw_line.len - 1] == '\r';
        const line_to_add = if (ends_with_cr) raw_line[0 .. raw_line.len - 1] else raw_line;

        if (expandTabs(allocator, line_to_add)) |expanded_line_owned| {
            // Freed right after `addLine`, as in the success path before.
            defer allocator.free(expanded_line_owned);
            try line_buf_storage.addLine(expanded_line_owned);
        } else |_| {
            std.debug.print("Warning: expandTabs failed for line: {s}\n", .{line_to_add});
            try line_buf_storage.addLine(line_to_add);
        }
    }

    const funcs = ParserFuncs{
        .parseInlinesRecursiveFn = parseInlinesRecursiveImpl,
        .parseInlinesFn = parseInlinesImpl,
        .parseBlocksFn = parseBlocksImpl,
        .parseHorizontalRuleFn = parseHorizontalRuleImpl,
        .isListItemLineFn = isListItemLineImpl,
    };
    return parseDocumentFromLines(allocator, funcs, line_buf_storage.getLines());
}
// --- AST Printer for Demo/Debug --- | |
/// Emits `depth` indentation units for the AST dump.
fn printAstIndent(depth: usize) void {
    for (0..depth) |_| print(" ", .{});
}

/// Prints the "Content:" line of a code block node at indentation `depth`.
/// Content of 60 bytes or more is summarised as "(long)".
fn printAstCodeContent(node: *MdNode, depth: usize) void {
    // Indent in every case, including the null one, so the line always sits
    // below its CODE_BLOCK / FENCED_CODE_BLOCK header.
    printAstIndent(depth);
    if (node.code_content) |cc| {
        if (cc.len < 60) print("Content: [[{s}]]\n", .{cc}) else print("Content: (long)\n", .{});
    } else {
        print("Content: (null)\n", .{});
    }
}

/// Recursively prints `node` and its children, one node per line, indented
/// by `indent_level` units. Intended for demos and debugging.
fn printAstNode(node: *MdNode, indent_level: usize) void {
    printAstIndent(indent_level);
    switch (node.node_type) {
        .Document => print("DOCUMENT\n", .{}),
        .Paragraph => print("PARAGRAPH\n", .{}),
        .Heading => print("HEADING (Level {d})\n", .{node.heading_level}),
        .Blockquote => print("BLOCKQUOTE\n", .{}),
        .UnorderedList => print("UNORDERED_LIST (Marker: '{c}', Tight: {s})\n", .{ node.list_item_marker, if (node.tight_list) "yes" else "no" }),
        .OrderedList => print("ORDERED_LIST (Start: {d}, Marker: '{c}', Tight: {s})\n", .{ node.list_start_number, node.list_item_marker, if (node.tight_list) "yes" else "no" }),
        .ListItem => print("LIST_ITEM (Marker: '{c}')\n", .{node.list_item_marker}),
        .CodeBlock => {
            print("CODE_BLOCK (Indented)\n", .{});
            printAstCodeContent(node, indent_level + 1);
        },
        .FencedCodeBlock => {
            print("FENCED_CODE_BLOCK (Lang: {?s})\n", .{node.code_language});
            printAstCodeContent(node, indent_level + 1);
        },
        .HorizontalRule => print("HORIZONTAL_RULE\n", .{}),
        .HtmlBlock => print("HTML_BLOCK: [[{?s}]]\n", .{node.text_content}),
        .Text => print("TEXT: \"{?s}\"\n", .{node.text_content}),
        .Emphasis => print("EMPHASIS\n", .{}),
        .Strong => print("STRONG\n", .{}),
        .Strikethrough => print("STRIKETHROUGH\n", .{}),
        .InlineCode => print("INLINE_CODE: `{?s}`\n", .{node.text_content}),
        .Link => print("LINK (URL: {?s}, Title: {?s})\n", .{ node.url, node.title }),
        .Image => print("IMAGE (Alt: {?s}, URL: {?s}, Title: {?s})\n", .{ node.alt_text, node.url, node.title }),
        .SoftBreak => print("SOFT_BREAK\n", .{}),
        .HardBreak => print("HARD_BREAK\n", .{}),
        .HtmlInline => print("HTML_INLINE: {?s}\n", .{node.text_content}),
        .EscapedChar => print("ESCAPED_CHAR: \\{?s}\n", .{node.text_content}),
    }
    for (node.children.items) |child_node| {
        printAstNode(child_node, indent_level + 1);
    }
}
/// Demo entry point: parses a built-in Markdown sample with `parseMarkdown`
/// and prints both the sample text and the resulting AST.
pub fn main() !void { | |
var gpa = std.heap.GeneralPurposeAllocator(.{}){}; | |
defer _ = gpa.deinit(); | |
const allocator = gpa.allocator(); | |
// Sample document exercising headings, emphasis, links, images, blockquotes,
// code blocks, lists, a horizontal rule, escapes, inline HTML and autolinks.
// NOTE(review): the sample appears to have lost its leading indentation and
// trailing spaces, and the e-mail addresses read "[email protected]" -
// verify the literal against the original file before relying on it.
const markdown_example = | |
\\# Welcome to Markdown | |
\\ | |
\\This is a paragraph with *italic* and **bold** text. | |
\\And a line with two spaces at the end for a hard break. | |
\\Next line. | |
\\ | |
\\Another paragraph with `inline code` and a [link](http://example.com "Optional Title"). | |
\\An image:  | |
\\ | |
\\> This is a blockquote. | |
\\> With multiple lines. | |
\\ | |
\\And a lazy continuation | |
\\for the blockquote. | |
\\ | |
\\## Sub Heading | |
\\ | |
\\Setext L1 | |
\\========= | |
\\ | |
\\Setext L2 | |
\\--------- | |
\\ | |
\\Indented code block: | |
\\ | |
\\ int main() { | |
\\ printf("Hello"); | |
\\ } | |
\\ | |
\\Fenced code block: | |
\\```c | |
\\void func() { | |
\\ // comment | |
\\} | |
\\``` | |
\\ | |
\\Unordered List (tight): | |
\\- Item 1 | |
\\ - Nested Item 1.1 | |
\\ - Nested Item 1.2 | |
\\- Item 2 | |
\\ * With more indent | |
\\ | |
\\Ordered List (tight): | |
\\1. First | |
\\2. Second | |
\\ 1) Nested first (marker change) | |
\\ 2) Nested second | |
\\3. Third | |
\\ | |
\\* Loose list item 1 | |
\\ | |
\\* Loose list item 2 | |
\\ This is content for item 2. | |
\\ | |
\\ Still item 2, after blank line. | |
\\ | |
\\* Loose list item 3 | |
\\ | |
\\Horizontal Rule: | |
\\--- | |
\\ | |
\\Escaped chars: \*hello\* \`code\` \[link\] | |
\\Inline HTML: <custom-tag attr="val">content</custom-tag> also <br/> this is text. | |
\\Autolink: <http://google.com> | |
\\Not an autolink: <[email protected]> (mailto: is required) | |
\\Autolink mail: <mailto:[email protected]> | |
; | |
print("Parsing Markdown:\n{s}\n", .{markdown_example}); | |
var doc = try parseMarkdown(allocator, markdown_example); | |
// The returned tree is released when main exits.
defer doc.deinit(); | |
print("\nAST Structure:\n", .{}); | |
printAstNode(doc, 0); | |
} | |
// Checks the AST for an ATX heading followed by a paragraph with emphasis.
test "basic markdown parsing" {
    // std.testing.allocator fails the test on leaks instead of only logging them.
    const allocator = std.testing.allocator;
    const md_text = "# Hello\n\nThis is *fun*.";

    const doc = try parseMarkdown(allocator, md_text);
    defer doc.deinit();

    try std.testing.expectEqual(MdNodeType.Document, doc.node_type);
    try std.testing.expectEqual(@as(usize, 2), doc.children.items.len);

    // "# Hello" -> HEADING(1) containing a single TEXT node.
    const heading = doc.children.items[0];
    try std.testing.expectEqual(MdNodeType.Heading, heading.node_type);
    try std.testing.expect(heading.heading_level == 1);
    try std.testing.expectEqual(@as(usize, 1), heading.children.items.len);
    const heading_text = heading.children.items[0];
    try std.testing.expectEqual(MdNodeType.Text, heading_text.node_type);
    try std.testing.expectEqualStrings("Hello", heading_text.text_content.?);

    // "This is *fun*." -> TEXT, EMPHASIS(TEXT), TEXT.
    const para = doc.children.items[1];
    try std.testing.expectEqual(MdNodeType.Paragraph, para.node_type);
    try std.testing.expectEqual(@as(usize, 3), para.children.items.len);
    const para_text1 = para.children.items[0];
    const para_emphasis = para.children.items[1];
    const para_text2 = para.children.items[2];

    try std.testing.expectEqual(MdNodeType.Text, para_text1.node_type);
    try std.testing.expectEqualStrings("This is ", para_text1.text_content.?);

    try std.testing.expectEqual(MdNodeType.Emphasis, para_emphasis.node_type);
    try std.testing.expectEqual(@as(usize, 1), para_emphasis.children.items.len);
    const emphasis_text = para_emphasis.children.items[0];
    try std.testing.expectEqual(MdNodeType.Text, emphasis_text.node_type);
    try std.testing.expectEqualStrings("fun", emphasis_text.text_content.?);

    try std.testing.expectEqual(MdNodeType.Text, para_text2.node_type);
    try std.testing.expectEqualStrings(".", para_text2.text_content.?);
}
// Checks the info string and content of fenced code blocks, including an
// empty block and a closing fence that shares a line with the content.
test "fenced code block lang and content" {
    // std.testing.allocator fails the test on leaks instead of only logging them.
    const allocator = std.testing.allocator;

    const md = "```zig\nconst x = 10;\n```";
    const doc = try parseMarkdown(allocator, md);
    defer doc.deinit();
    try std.testing.expectEqual(@as(usize, 1), doc.children.items.len);
    const fcb = doc.children.items[0];
    try std.testing.expectEqual(MdNodeType.FencedCodeBlock, fcb.node_type);
    try std.testing.expectEqualStrings("zig", fcb.code_language.?);
    try std.testing.expectEqualStrings("const x = 10;", fcb.code_content.?);

    // No info string and a single blank content line.
    const md_empty = "```\n\n```";
    const doc_empty = try parseMarkdown(allocator, md_empty);
    defer doc_empty.deinit();
    const fcb_empty = doc_empty.children.items[0];
    try std.testing.expectEqual(MdNodeType.FencedCodeBlock, fcb_empty.node_type);
    try std.testing.expect(fcb_empty.code_language == null);
    try std.testing.expectEqualStrings("", fcb_empty.code_content.?);

    // Closing fence directly after the content, without a newline in between.
    const md_no_final_newline = "```\ntest```";
    const doc_no_finalnl = try parseMarkdown(allocator, md_no_final_newline);
    defer doc_no_finalnl.deinit();
    const fcb_no_finalnl = doc_no_finalnl.children.items[0];
    try std.testing.expectEqualStrings("test", fcb_no_finalnl.code_content.?);
}
// Checks how blank lines between and inside list items affect `tight_list`.
test "list parsing tight vs loose" {
    // std.testing.allocator fails the test on leaks instead of only logging them.
    const allocator = std.testing.allocator;

    // Adjacent items with no blank line: tight.
    const md_tight = "- item1\n- item2";
    const doc_tight = try parseMarkdown(allocator, md_tight);
    defer doc_tight.deinit();
    try std.testing.expectEqual(@as(usize, 1), doc_tight.children.items.len);
    const list_tight = doc_tight.children.items[0];
    try std.testing.expectEqual(MdNodeType.UnorderedList, list_tight.node_type);
    try std.testing.expect(list_tight.tight_list);

    // Blank line between two items: loose.
    const md_loose_blank_between = "- item1\n\n- item2";
    const doc_loose_bb = try parseMarkdown(allocator, md_loose_blank_between);
    defer doc_loose_bb.deinit();
    try std.testing.expectEqual(@as(usize, 1), doc_loose_bb.children.items.len);
    const list_loose_bb = doc_loose_bb.children.items[0];
    try std.testing.expectEqual(MdNodeType.UnorderedList, list_loose_bb.node_type);
    try std.testing.expect(!list_loose_bb.tight_list);

    // Blank line inside item content
    const md_loose_internal_blank = "- item1\n para1\n\n para2\n- item2";
    const doc_loose_ib = try parseMarkdown(allocator, md_loose_internal_blank);
    defer doc_loose_ib.deinit();
    const list_loose_ib = doc_loose_ib.children.items[0];
    try std.testing.expectEqual(MdNodeType.UnorderedList, list_loose_ib.node_type);
    try std.testing.expect(!list_loose_ib.tight_list);
    try std.testing.expectEqual(@as(usize, 2), list_loose_ib.children.items[0].children.items.len);

    // Single item followed only by a blank line.
    const md_loose_ends_blank = "* item1\n\n";
    const doc_loose_eb = try parseMarkdown(allocator, md_loose_ends_blank);
    defer doc_loose_eb.deinit();
    const list_loose_eb = doc_loose_eb.children.items[0];
    try std.testing.expectEqual(MdNodeType.UnorderedList, list_loose_eb.node_type);
    // This case is subtle. CommonMark: "A list is loose if any of its constituent
    // list items are separated by blank lines, or if any of its constituent list
    // items directly contain two block-level elements with a blank line between them."
    //   `* item1\n\n* item2`               -> loose (blank line between items)
    //   `* item1\n\n Indented block`       -> loose (blank line inside item1)
    //   `* item1\n\nParagraph after list`  -> item1 is tight
    // A trailing blank line that merely follows the item should arguably leave the
    // list tight; the parser may be too aggressive here. For now the test pins the
    // current behaviour: a blank line consumed as part of the item makes the list loose.
    try std.testing.expect(!list_loose_eb.tight_list);
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// This is an implementation from scratch | |
const std = @import("std"); | |
const Allocator = std.mem.Allocator; | |
const ArrayList = std.ArrayList; | |
const AutoHashMap = std.AutoHashMap; | |
const StringHashMap = std.StringHashMap; | |
const util = std.util; | |
const ascii = std.ascii; | |
const mem = std.mem; | |
const DEBUG = false; // Set to true for debug prints | |
/// Debug logging helper: forwards `fmt`/`args` to `std.debug.print` when the
/// file-level `DEBUG` flag is true and does nothing otherwise.
fn dbgPrint(comptime fmt: []const u8, args: anytype) void {
    switch (DEBUG) {
        true => std.debug.print(fmt, args),
        false => {},
    }
}
/// Tag stored in `Node.type` that identifies which Markdown construct a node
/// represents.
const NodeType = enum { | |
Document, | |
Paragraph, | |
Heading, | |
Blockquote, | |
UnorderedList, | |
OrderedList, | |
ListItem, | |
CodeBlock, // Indented | |
FencedCodeBlock, | |
HorizontalRule, | |
HtmlBlock, | |
Text, | |
Emphasis, | |
Strong, | |
Strikethrough, | |
Link, | |
Image, | |
InlineCode, | |
RawHtml, // Inline HTML | |
LineBreak, // Hard line break | |
// LinkReferenceDefinition, // Not a visible node, stored in parser | |
}; | |
/// Destination and optional title recorded for a link reference definition.
/// Values of this type are stored in `Parser.link_references`; both slices
/// are freed with the parser's allocator in `Parser.deinit`.
const LinkRef = struct { | |
url: []const u8, | |
title: ?[]const u8, | |
}; | |
/// One node of the parsed document tree.
///
/// Children are stored by value. The optional byte-slice fields (`content`,
/// `url`, `title`, `alt_text`, `info_string`) are freed with `allocator` in
/// `deinit`, so anything assigned to them must have been allocated with it.
const Node = struct {
    type: NodeType,
    children: ArrayList(Node),
    content: ?[]const u8,
    level: ?u8,
    url: ?[]const u8,
    title: ?[]const u8,
    alt_text: ?[]const u8,
    info_string: ?[]const u8,
    start_number: ?u64,
    tight: ?bool, // For lists
    is_task_list_item: ?bool,
    is_task_list_item_checked: ?bool,
    allocator: Allocator,

    /// Creates a childless node of `node_type` with every optional field unset.
    pub fn init(allocator: Allocator, node_type: NodeType) Node {
        return Node{
            .allocator = allocator,
            .type = node_type,
            .children = ArrayList(Node).init(allocator),
            .content = null,
            .level = null,
            .url = null,
            .title = null,
            .alt_text = null,
            .info_string = null,
            .start_number = null,
            .tight = null,
            .is_task_list_item = null,
            .is_task_list_item_checked = null,
        };
    }

    /// Releases the whole subtree: children first, then this node's own strings.
    pub fn deinit(self: *Node) void {
        for (self.children.items) |*child| child.deinit();
        self.children.deinit();

        const owned_strings = [_]?[]const u8{
            self.content,
            self.url,
            self.title,
            self.alt_text,
            self.info_string,
        };
        for (owned_strings) |maybe_bytes| {
            if (maybe_bytes) |bytes| self.allocator.free(bytes);
        }
    }

    /// Appends `child` (by value) as the last child of this node.
    pub fn appendChild(self: *Node, child: Node) !void {
        try self.children.append(child);
    }

    /// Dumps this node and its descendants via `std.debug.print`, one node
    /// per line, showing only the attributes that are set.
    pub fn print(self: Node, indent_level: usize) void {
        const out = std.debug.print;

        for (0..indent_level) |_| out(" ", .{});
        out("{any}", .{self.type});

        if (self.content) |text| out(" \"{s}\"", .{text});
        if (self.level) |lvl| out(" (L{d})", .{lvl});
        if (self.url) |link| out(" (url: \"{s}\")", .{link});
        if (self.title) |caption| out(" (title: \"{s}\")", .{caption});
        if (self.alt_text) |alt| out(" (alt: \"{s}\")", .{alt});
        if (self.info_string) |info| out(" (info: \"{s}\")", .{info});
        if (self.start_number) |start| out(" (start: {d})", .{start});
        if (self.tight) |is_tight| out(" (tight: {any})", .{is_tight});
        // The checked state is only shown for nodes explicitly flagged as task items.
        if (self.is_task_list_item orelse false) {
            out(" (task: {any})", .{self.is_task_list_item_checked});
        }
        out("\n", .{});

        for (self.children.items) |child| child.print(indent_level + 1);
    }
};
/// Returns how many consecutive bytes at the start of `slice` equal
/// `char_to_count` (the whole length if every byte matches).
fn countLeadingChars(slice: []const u8, char_to_count: u8) usize {
    for (slice, 0..) |byte, idx| {
        if (byte != char_to_count) return idx;
    }
    return slice.len;
}
/// Returns `slice` with every leading and trailing byte that appears in
/// `comptime_chars` removed. The result is a sub-slice of the input.
fn trimAll(slice: []const u8, comptime_chars: []const u8) []const u8 {
    const strip_set = comptime_chars;
    const trimmed: []const u8 = mem.trim(u8, slice, strip_set);
    return trimmed;
}
/// Strips spaces, tabs, carriage returns and newlines from both ends of
/// `line` and returns the remaining sub-slice.
fn trimLine(line: []const u8) []const u8 {
    const line_whitespace = " \t\r\n";
    const trimmed: []const u8 = mem.trim(u8, line, line_whitespace);
    return trimmed;
}
/// Measures the leading indentation of `line` in columns.
///
/// A space counts as one column; a tab advances to the next multiple of
/// `tab_stop`. Scanning stops at the first byte that is neither.
fn getIndentWidth(line: []const u8, tab_stop: usize) usize {
    var columns: usize = 0;
    for (line) |byte| {
        switch (byte) {
            ' ' => columns += 1,
            '\t' => columns += tab_stop - (columns % tab_stop),
            else => break,
        }
    }
    return columns;
}
/// Removes up to `indent_to_strip` columns of leading whitespace from `line`
/// and returns the rest as a sub-slice.
///
/// Spaces count one column each; a tab counts up to the next `tab_stop`
/// boundary. Simplification: a tab that straddles the requested width is
/// consumed whole, so the columns it contributes beyond `indent_to_strip`
/// are dropped rather than replaced with spaces.
fn stripIndent(line: []const u8, indent_to_strip: usize, tab_stop: usize) []const u8 {
    var stripped_cols: usize = 0;
    var pos: usize = 0;
    while (pos < line.len and stripped_cols < indent_to_strip) : (pos += 1) {
        switch (line[pos]) {
            ' ' => stripped_cols += 1,
            '\t' => stripped_cols += tab_stop - (stripped_cols % tab_stop),
            // First non-indent byte: stop without consuming it.
            else => break,
        }
    }
    return line[pos..];
}
/// True when `line` is empty or consists solely of spaces, tabs, carriage
/// returns and newlines.
fn isBlankLine(line: []const u8) bool {
    for (line) |byte| {
        switch (byte) {
            ' ', '\t', '\r', '\n' => {},
            else => return false,
        }
    }
    return true;
}
const TAB_STOP: usize = 4; | |
const Parser = struct { | |
allocator: Allocator, | |
input_lines: ArrayList([]const u8), | |
current_line_idx: usize, | |
link_references: StringHashMap(LinkRef), | |
recursion_depth: usize, // To prevent deep recursion in list/blockquote parsing | |
const MAX_RECURSION_DEPTH = 64; // Arbitrary limit | |
/// Creates a parser for `markdown_text`.
///
/// The text is split into owned lines immediately, so the input slice does
/// not need to outlive this call. Release the parser with `deinit`.
/// Returns an error if a line cannot be allocated; nothing is leaked then.
pub fn init(allocator: Allocator, markdown_text: []const u8) !Parser {
    var p = Parser{
        .allocator = allocator,
        .input_lines = ArrayList([]const u8).init(allocator),
        .current_line_idx = 0,
        .link_references = StringHashMap(LinkRef).init(allocator),
        .recursion_depth = 0,
    };
    // Lines split before a failure are already owned by `p`; release them.
    errdefer p.deinit();
    try p.preprocessAndSplitLines(markdown_text);
    return p;
}
/// Releases everything the parser owns: the split input lines and the
/// link reference table, including its keys and the url/title strings.
pub fn deinit(self: *Parser) void {
    // `preprocessAndSplitLines` stores every line as its own allocation.
    for (self.input_lines.items) |line| {
        self.allocator.free(line);
    }
    self.input_lines.deinit();

    // Keys are the normalized labels allocated when a definition is stored;
    // the map does not free them itself, so release them with the values.
    var iter = self.link_references.iterator();
    while (iter.next()) |entry| {
        self.allocator.free(entry.key_ptr.*);
        self.allocator.free(entry.value_ptr.url);
        if (entry.value_ptr.title) |t| self.allocator.free(t);
    }
    self.link_references.deinit();
}
/// Splits `original_input` into lines and appends an owned copy of each to
/// `input_lines`.
///
/// CRLF, lone CR and LF are all accepted as line endings, and every
/// terminated line is stored with a single trailing '\n'. A final line
/// without a terminator is stored as-is, with no '\n' added.
fn preprocessAndSplitLines(self: *Parser, original_input: []const u8) !void {
    var line_start: usize = 0;
    var i: usize = 0;
    while (i < original_input.len) : (i += 1) {
        const char = original_input[i];
        if (char != '\r' and char != '\n') continue;

        const owned = try mem.concat(self.allocator, u8, &.{ original_input[line_start..i], "\n" });
        // Do not leak the copy if growing the line list fails.
        errdefer self.allocator.free(owned);
        try self.input_lines.append(owned);

        // CR LF counts as one terminator: step over the LF as well.
        if (char == '\r' and i + 1 < original_input.len and original_input[i + 1] == '\n') {
            i += 1;
        }
        line_start = i + 1;
    }

    if (line_start < original_input.len) {
        // No final newline if not present
        const owned = try self.allocator.dupe(u8, original_input[line_start..]);
        errdefer self.allocator.free(owned);
        try self.input_lines.append(owned);
    }
}
/// Returns the line at the cursor, or null once the cursor is past the last line.
fn currentLine(self: *const Parser) ?[]const u8 {
    const lines = self.input_lines.items;
    return if (self.current_line_idx < lines.len) lines[self.current_line_idx] else null;
}
/// Returns the line `offset` positions after the cursor without moving it,
/// or null if that position is past the last line.
fn peekLine(self: *const Parser, offset: usize) ?[]const u8 {
    const lines = self.input_lines.items;
    const target = self.current_line_idx + offset;
    return if (target < lines.len) lines[target] else null;
}
/// Moves the cursor to the next line; does nothing once the end is reached.
fn advanceLine(self: *Parser) void {
    if (self.current_line_idx >= self.input_lines.items.len) return;
    self.current_line_idx += 1;
}
/// Returns the line at the cursor (null at end of input) and then advances
/// the cursor past it.
fn consumeCurrentLine(self: *Parser) ?[]const u8 {
    // The deferred advance runs after the return value has been read.
    defer self.advanceLine();
    return self.currentLine();
}
/// True when the cursor has moved past the last input line.
fn isEof(self: *const Parser) bool {
    const has_more_lines = self.current_line_idx < self.input_lines.items.len;
    return !has_more_lines;
}
/// Parses all remaining input lines into a `.Document` node.
///
/// Each iteration tries the block parsers in a fixed order and restarts as
/// soon as one of them consumes input. A line that no parser accepts is
/// skipped so the loop always makes progress. On error the partially built
/// document is released.
fn parseDocument(self: *Parser) !Node {
    var document = Node.init(self.allocator, .Document);
    errdefer document.deinit();

    while (!self.isEof()) {
        const line = self.currentLine().?;
        const indent = getIndentWidth(line, TAB_STOP);

        if (self.tryParseBlankLines(&document)) continue;
        if (try self.tryParseLinkReferenceDefinition(&document)) continue;
        if (try self.tryParseThematicBreak(&document)) continue;
        if (try self.tryParseAtxHeading(&document)) continue;
        if (try self.tryParseFencedCodeBlock(&document)) continue;
        if (try self.tryParseHtmlBlock(&document)) continue; // TODO
        if (try self.tryParseBlockquote(&document)) continue;
        if (try self.tryParseUnorderedList(&document, indent)) continue;
        if (try self.tryParseOrderedList(&document, indent)) continue;

        // Indented code block must not interrupt a paragraph, so it is only
        // attempted when the previous top-level block is not a paragraph
        // (or there is no previous block).
        const blocks = document.children.items;
        const last_block_was_paragraph = blocks.len > 0 and blocks[blocks.len - 1].type == .Paragraph;
        if (indent >= TAB_STOP and !isBlankLine(line) and !last_block_was_paragraph) {
            if (try self.tryParseIndentedCodeBlock(&document)) continue;
        }

        // Paragraphs and Setext headings are the fallback.
        if (try self.tryParseParagraphOrSetext(&document)) continue;

        // Nothing matched (should be rare). Consume the line to avoid an
        // infinite loop.
        dbgPrint("Warning: Unparsed line: {s}\n", .{line});
        self.advanceLine();
    }
    return document;
}
// --- Block Parsers --- | |
/// Skips the run of blank lines starting at the cursor.
/// Returns true if at least one line was skipped.
fn tryParseBlankLines(self: *Parser, parent_node: *Node) bool {
    _ = parent_node; // Not used yet, might be for tight list determination

    const first_idx = self.current_line_idx;
    while (self.currentLine()) |line| {
        if (!isBlankLine(line)) break;
        self.advanceLine();
    }
    return self.current_line_idx != first_idx;
}
/// Returns the index of the first byte at or after `from` that is not
/// whitespace, treating '\n' as a stop character rather than whitespace.
fn skipLineWhitespace(text: []const u8, from: usize) usize {
    var pos = from;
    while (pos < text.len and ascii.isWhitespace(text[pos]) and text[pos] != '\n') : (pos += 1) {}
    return pos;
}

/// Tries to read a single-line link reference definition,
/// `[label]: destination "optional title"`, at the current line.
///
/// On success the definition is stored in `link_references` under the
/// normalized label (the first definition of a label wins), the line is
/// consumed and true is returned. If the line is not a valid definition,
/// nothing is consumed or allocated and false is returned.
///
/// Limitations: the definition must fit on one line, and backslash escapes
/// in the destination and title are kept verbatim (not unescaped).
fn tryParseLinkReferenceDefinition(self: *Parser, _: *Node) !bool {
    const line = self.currentLine() orelse return false;
    const indent = getIndentWidth(line, TAB_STOP);
    if (indent >= TAB_STOP) return false; // Max 3 spaces indent
    const text = stripIndent(line, indent, TAB_STOP);
    if (text.len == 0 or text[0] != '[') return false;

    // --- Label: runs up to the ']' that balances the opening '['.
    var depth: usize = 0;
    var label_close: ?usize = null;
    for (text, 0..) |char, idx| {
        if (char == '[') {
            depth += 1;
        } else if (char == ']') {
            depth -= 1; // Cannot underflow: text[0] is '['.
            if (depth == 0) {
                label_close = idx;
                break;
            }
        } else if (char == '\n' or char == '\r') {
            return false; // Label cannot span lines
        }
    }
    const label_end = label_close orelse return false;
    const label = mem.trim(u8, text[1..label_end], " \t\r\n");
    if (label.len == 0) return false; // Empty or whitespace-only label

    // --- ':' and optional whitespace before the destination.
    var i = label_end + 1;
    if (i >= text.len or text[i] != ':') return false;
    i = skipLineWhitespace(text, i + 1);
    if (i >= text.len or text[i] == '\n') return false; // No URL

    // --- Destination.
    var dest: []const u8 = undefined;
    if (text[i] == '<') { // URL in <>
        const url_start = i + 1;
        var url_end = url_start;
        var angle_depth: usize = 1;
        while (url_end < text.len) : (url_end += 1) {
            const c = text[url_end];
            if (c == '<') {
                angle_depth += 1;
            } else if (c == '>') {
                angle_depth -= 1;
                if (angle_depth == 0) break;
            } else if (c == '\n' or c == '\\') {
                return false; // Newline or backslash in <url> not allowed
            }
        }
        if (angle_depth != 0) return false; // Mismatched <>
        dest = text[url_start..url_end];
        i = url_end + 1; // past '>'
    } else { // URL not in <>
        const url_start = i;
        var url_end = url_start;
        var paren_depth: usize = 0;
        while (url_end < text.len) : (url_end += 1) {
            const c = text[url_end];
            if (ascii.isWhitespace(c)) break;
            if (c == '(') {
                paren_depth += 1;
            } else if (c == ')') {
                if (paren_depth == 0) break; // Unbalanced ')' ends the URL
                paren_depth -= 1;
            } else if (c == '\\' and url_end + 1 < text.len) {
                url_end += 1; // Keep the escaped character as part of the URL
            }
        }
        if (url_start == url_end) return false; // Empty URL
        dest = text[url_start..url_end];
        i = url_end;
    }
    // TODO: Unescape URL

    // --- Optional title. Anything else after the destination rejects the line.
    var title_raw: ?[]const u8 = null;
    i = skipLineWhitespace(text, i);
    if (i < text.len and text[i] != '\n') {
        const closing_char: u8 = switch (text[i]) {
            '"' => '"',
            '\'' => '\'',
            '(' => ')',
            else => return false, // Junk after the destination
        };
        const title_start = i + 1;
        var title_end = title_start;
        var found_closing = false;
        while (title_end < text.len) {
            const c = text[title_end];
            if (c == '\\' and title_end + 1 < text.len) {
                title_end += 2; // Skip escaped char
                continue;
            }
            if (c == closing_char) {
                found_closing = true;
                break;
            }
            if (c == '\n') break; // Title cannot span lines
            title_end += 1;
        }
        if (!found_closing) return false;

        const after_title = skipLineWhitespace(text, title_end + 1);
        if (after_title < text.len and text[after_title] != '\n') return false; // Junk after title
        // TODO: unescape title
        title_raw = text[title_start..title_end];
    }

    // --- Store the definition. Allocation starts only now, so every
    // rejection path above returns without anything to clean up.
    const key = try self.normalizeLabelForMap(label);
    if (self.link_references.contains(key)) {
        // Duplicate label: the first definition wins, this one is ignored.
        self.allocator.free(key);
    } else {
        errdefer self.allocator.free(key);
        const url = try self.allocator.dupe(u8, dest);
        errdefer self.allocator.free(url);
        const title: ?[]const u8 = if (title_raw) |t| try self.allocator.dupe(u8, t) else null;
        errdefer if (title) |t| self.allocator.free(t);
        try self.link_references.put(key, LinkRef{ .url = url, .title = title });
    }

    self.advanceLine();
    return true;
}
/// Produces the lookup key for a link label: ASCII-lowercased, with leading
/// and trailing whitespace removed and each internal whitespace run
/// collapsed to a single space.
///
/// The returned slice is allocated with the parser's allocator and owned by
/// the caller.
fn normalizeLabelForMap(self: *Parser, label: []const u8) ![]const u8 {
    var buf = ArrayList(u8).init(self.allocator);
    // Frees the buffer if an append fails; after a successful toOwnedSlice
    // the list is empty, so this is a harmless no-op.
    defer buf.deinit();

    // Starting as "true" drops leading whitespace.
    var last_was_space = true;
    for (label) |c| {
        if (ascii.isWhitespace(c)) {
            if (!last_was_space) {
                try buf.append(' ');
                last_was_space = true;
            }
        } else {
            try buf.append(ascii.toLower(c));
            last_was_space = false;
        }
    }
    // A trailing whitespace run leaves one space behind; remove it.
    if (buf.items.len > 0 and buf.items[buf.items.len - 1] == ' ') {
        _ = buf.pop();
    }
    return buf.toOwnedSlice();
}
/// Tries to read a thematic break (`***`, `---`, `___`) at the current line.
///
/// The line must be indented less than `TAB_STOP` columns and contain at
/// least three occurrences of one marker character, optionally separated by
/// spaces or tabs, and nothing else. On success a `.HorizontalRule` node is
/// appended to `parent_node`, the line is consumed and true is returned.
fn tryParseThematicBreak(self: *Parser, parent_node: *Node) !bool {
    const line = self.currentLine() orelse return false;
    const indent = getIndentWidth(line, TAB_STOP);
    if (indent >= TAB_STOP) return false;

    var marker_char: ?u8 = null;
    var count: usize = 0;
    for (stripIndent(line, indent, TAB_STOP)) |char| {
        switch (char) {
            '*', '-', '_' => {
                if (marker_char) |m| {
                    // Mixing marker characters disqualifies the line.
                    if (char != m) return false;
                } else {
                    marker_char = char;
                }
                count += 1;
            },
            ' ', '\t' => {}, // spaces are allowed
            '\n', '\r' => break, // end of line
            else => return false,
        }
    }
    if (count < 3) return false;

    var hr_node = Node.init(self.allocator, .HorizontalRule);
    errdefer hr_node.deinit();
    try parent_node.appendChild(hr_node);
    self.advanceLine();
    return true;
}
/// Tries to parse an ATX heading (`# Title`, `## Title ##`, ...) at the
/// current line.
///
/// Rules applied (following CommonMark):
///   * the line is indented less than `TAB_STOP` columns;
///   * it opens with 1-6 `#` characters followed by a space, tab, line ending
///     or the end of the line (`#foo` is not a heading);
///   * an optional closing run of `#` is removed when it is preceded by a
///     space/tab or makes up the whole heading text (`## ##` is an empty h2),
///     while `# foo#` keeps its trailing `#`.
///
/// Returns true after appending a `.Heading` node (with its inline children)
/// to `parent_node` and consuming the line; returns false otherwise.
fn tryParseAtxHeading(self: *Parser, parent_node: *Node) !bool {
    const line = self.currentLine() orelse return false;
    const indent = getIndentWidth(line, TAB_STOP);
    if (indent >= TAB_STOP) return false;
    const content = stripIndent(line, indent, TAB_STOP);

    // Count the opening '#' run. Bailing out at the seventh '#' keeps the
    // counter inside u8 no matter how long the run is.
    var level: u8 = 0;
    while (level < content.len and content[level] == '#') {
        if (level == 6) return false;
        level += 1;
    }
    if (level == 0) return false;

    // The opening run must be followed by whitespace or the end of the line.
    var text_start: usize = level;
    if (text_start < content.len) {
        switch (content[text_start]) {
            ' ', '\t', '\n', '\r' => {},
            else => return false,
        }
    }
    while (text_start < content.len and (content[text_start] == ' ' or content[text_start] == '\t')) : (text_start += 1) {}

    // Drop the line ending and any trailing blanks.
    var text_end: usize = content.len;
    while (text_end > text_start) : (text_end -= 1) {
        switch (content[text_end - 1]) {
            ' ', '\t', '\n', '\r' => {},
            else => break,
        }
    }

    // Optional closing sequence: a run of '#' at the very end that is either
    // preceded by a blank or is the entire remaining text.
    var closing_start = text_end;
    while (closing_start > text_start and content[closing_start - 1] == '#') : (closing_start -= 1) {}
    const has_closing_run = closing_start < text_end;
    if (has_closing_run and (closing_start == text_start or
        content[closing_start - 1] == ' ' or content[closing_start - 1] == '\t'))
    {
        text_end = closing_start;
        while (text_end > text_start and (content[text_end - 1] == ' ' or content[text_end - 1] == '\t')) : (text_end -= 1) {}
    }

    var heading_node = Node.init(self.allocator, .Heading);
    errdefer heading_node.deinit();
    heading_node.level = level;
    try self.parseInlines(content[text_start..text_end], &heading_node, .{});
    try parent_node.appendChild(heading_node);

    self.advanceLine();
    return true;
}
/// Tries to parse a fenced code block (``` or ~~~ fences) starting at the
/// current line.
///
/// Opening fence: indented less than `TAB_STOP`, at least three identical
/// fence characters, followed by an optional info string (which may not
/// contain a backtick when the fence itself is made of backticks).
///
/// Closing fence: indented less than `TAB_STOP`, made of the same character,
/// at least as long as the opening fence, and followed only by whitespace.
/// A fence-looking line indented `TAB_STOP` or more is code content.
///
/// Every content line is stripped of up to the opening fence's indentation
/// and appended (line ending included, as delivered by `currentLine`).
///
/// Returns true after appending a `.FencedCodeBlock` node to `parent_node`.
/// Returns false, with the line cursor restored, when the current line is not
/// an opening fence or no closing fence is found before the input ends.
// NOTE(review): CommonMark lets an unclosed fence run to the end of the
// document; the rewind below is the file's existing behaviour and is kept.
fn tryParseFencedCodeBlock(self: *Parser, parent_node: *Node) !bool {
    const line = self.currentLine() orelse return false;
    const initial_line_idx = self.current_line_idx;

    const indent = getIndentWidth(line, TAB_STOP);
    if (indent >= TAB_STOP) return false;
    const opener = stripIndent(line, indent, TAB_STOP);
    if (opener.len < 3) return false;

    const fence_char = opener[0];
    if (fence_char != '`' and fence_char != '~') return false;
    var fence_len: usize = 0;
    while (fence_len < opener.len and opener[fence_len] == fence_char) : (fence_len += 1) {}
    if (fence_len < 3) return false;

    const info_raw = mem.trim(u8, opener[fence_len..], " \t\r\n");
    if (fence_char == '`' and mem.indexOfScalar(u8, info_raw, '`') != null) return false;

    // Owned copy of the info string. It is released on every exit path unless
    // ownership has been handed to the node (the variable is nulled then).
    var info_string: ?[]u8 = if (info_raw.len > 0) try self.allocator.dupe(u8, info_raw) else null;
    defer if (info_string) |owned| self.allocator.free(owned);

    self.advanceLine(); // consume the opening fence

    var code_buf = ArrayList(u8).init(self.allocator);
    defer code_buf.deinit();

    while (self.currentLine()) |body_line| {
        const body_indent = getIndentWidth(body_line, TAB_STOP);
        if (body_indent < TAB_STOP) {
            const candidate = stripIndent(body_line, body_indent, TAB_STOP);
            var run_len: usize = 0;
            while (run_len < candidate.len and candidate[run_len] == fence_char) : (run_len += 1) {}

            if (run_len >= fence_len and mem.trim(u8, candidate[run_len..], " \t\r\n").len == 0) {
                self.advanceLine(); // consume the closing fence

                var code_block_node = Node.init(self.allocator, .FencedCodeBlock);
                // From here on the node is the single owner of its fields;
                // this assumes Node.deinit releases info_string and content.
                errdefer code_block_node.deinit();
                code_block_node.info_string = info_string;
                info_string = null;
                code_block_node.content = try code_buf.toOwnedSlice();
                try parent_node.appendChild(code_block_node);
                return true;
            }
        }

        try code_buf.appendSlice(stripIndent(body_line, indent, TAB_STOP));
        self.advanceLine();
    }

    // Ran out of input without a closing fence: undo and let another block
    // parser handle the opening line.
    self.current_line_idx = initial_line_idx;
    return false;
}
/// Parses an indented code block starting at the current line.
///
/// Lines indented by at least `TAB_STOP` columns are appended with
/// `TAB_STOP` columns of indentation removed. Blank lines before the first
/// code line are skipped; blank lines between code lines are kept as a single
/// "\n"; blank lines after the last code line are consumed but not stored.
/// The block ends at the first non-blank line indented less than `TAB_STOP`.
///
/// Returns true after appending a `.CodeBlock` node to `parent_node`.
/// Returns false, with the line cursor restored, when no code line was found.
fn tryParseIndentedCodeBlock(self: *Parser, parent_node: *Node) !bool {
    const initial_line_idx = self.current_line_idx;

    var code_buf = ArrayList(u8).init(self.allocator);
    defer code_buf.deinit();

    // Buffer length right after the most recent code line; everything beyond
    // it is newline padding from trailing blank lines.
    var len_after_last_code_line: usize = 0;
    var seen_code_line = false;

    while (self.currentLine()) |line| {
        if (isBlankLine(line)) {
            // Append the byte directly; duplicating "\n" on the heap first
            // would leak one allocation per blank line.
            if (seen_code_line) try code_buf.append('\n');
            self.advanceLine();
            continue;
        }

        if (getIndentWidth(line, TAB_STOP) < TAB_STOP) break;

        try code_buf.appendSlice(stripIndent(line, TAB_STOP, TAB_STOP));
        len_after_last_code_line = code_buf.items.len;
        seen_code_line = true;
        self.advanceLine();
    }

    if (!seen_code_line) {
        self.current_line_idx = initial_line_idx;
        return false;
    }

    // Drop the placeholders written for trailing blank lines.
    code_buf.shrinkRetainingCapacity(len_after_last_code_line);

    var code_block_node = Node.init(self.allocator, .CodeBlock);
    // The node owns `content` once assigned; this assumes Node.deinit
    // releases it, so no second cleanup is registered for the slice.
    errdefer code_block_node.deinit();
    code_block_node.content = try code_buf.toOwnedSlice();
    try parent_node.appendChild(code_block_node);
    return true;
}
/// Parses a paragraph, or a Setext heading when a paragraph line is followed
/// by an `===` / `---` underline.
///
/// Lines are collected until a blank line, the end of input, or (from the
/// second line on) a line indented less than `TAB_STOP` that starts a
/// thematic break, ATX heading, fenced code block, blockquote or list item.
/// Each collected line is passed through `trimLine` and the results are
/// joined with a single space before inline parsing.
///
/// Returns false when no line could be collected. Returns true when lines
/// were consumed; a `.Heading` or `.Paragraph` node is appended to
/// `parent_node` unless the resulting text (or paragraph) is empty.
fn tryParseParagraphOrSetext(self: *Parser, parent_node: *Node) !bool {
    var text_buf = ArrayList(u8).init(self.allocator);
    defer text_buf.deinit();

    var collected_lines: usize = 0;
    var setext_level: u8 = 0;

    while (self.currentLine()) |line| {
        if (isBlankLine(line)) break;

        // The first line is accepted unconditionally; later lines may be the
        // start of a block that interrupts the paragraph.
        if (collected_lines > 0) {
            const indent = getIndentWidth(line, TAB_STOP);
            if (indent < TAB_STOP) {
                const rest = stripIndent(line, indent, TAB_STOP);
                if (isThematicBreakStart(rest) or
                    isAtxHeadingStart(rest) or
                    isFencedCodeBlockStart(rest) or
                    isBlockquoteStart(rest) or
                    isListItemStart(rest) != .NotListItem) break;
            }
        }

        // Look at the following line before consuming this one.
        const underline_level: u8 = if (self.peekLine(1)) |next_line|
            getSetextUnderlineLevel(next_line)
        else
            0;

        // Append straight into the joined buffer: a soft line break is
        // represented by one space between trimmed lines.
        if (collected_lines > 0) try text_buf.append(' ');
        try text_buf.appendSlice(trimLine(line));
        collected_lines += 1;
        self.advanceLine();

        if (underline_level > 0) {
            self.advanceLine(); // consume the underline itself
            setext_level = underline_level;
            break;
        }
    }

    if (collected_lines == 0) return false;

    const combined_text = mem.trim(u8, text_buf.items, " ");
    // Lines were consumed but carried no content; nothing to emit.
    if (combined_text.len == 0) return true;

    if (setext_level > 0) {
        var heading_node = Node.init(self.allocator, .Heading);
        errdefer heading_node.deinit();
        heading_node.level = setext_level;
        try self.parseInlines(combined_text, &heading_node, .{});
        try parent_node.appendChild(heading_node);
    } else {
        var para_node = Node.init(self.allocator, .Paragraph);
        errdefer para_node.deinit();
        try self.parseInlines(combined_text, &para_node, .{});
        // A paragraph without inline children is dropped rather than emitted.
        if (para_node.children.items.len > 0) {
            try parent_node.appendChild(para_node);
        } else {
            para_node.deinit();
        }
    }
    return true;
}
// --- Inline Parsing --- | |
// Simplified inline parsing. A full CommonMark inline parser is much more complex. | |
// This version handles basic cases without sophisticated delimiter stack. | |
/// Options threaded through `parseInlines` and its recursive calls.
/// The struct currently has no fields; callers in this file pass `.{}`.
const InlineContext = struct { | |
// Future: allow_links: bool, allow_emphasis: bool, etc. | |
}; | |
/// Appends `segment` to `parent_node` as a `.Text` node.
/// Does nothing when `segment` is empty.
fn appendInlineText(self: *Parser, parent_node: *Node, segment: []const u8) !void {
    if (segment.len == 0) return;
    var text_node = Node.init(self.allocator, .Text);
    errdefer text_node.deinit();
    text_node.content = try self.allocator.dupe(u8, segment);
    try parent_node.appendChild(text_node);
}

/// Parses inline Markdown in `text` and appends the resulting nodes to
/// `parent_node`.
///
/// This is a simplified scanner without a delimiter stack. It recognises:
///   1. backslash escapes of Markdown punctuation (emitted as `.Text`);
///   2. code spans delimited by equal-length backtick runs (`.InlineCode`);
///   3. `**`/`__` strong, `*`/`_` emphasis and `~~` strikethrough, via
///      `findAndParseEmphasisLike`;
///   4. inline links `[text](dest "title")` and images `![alt](dest "title")`;
///   5. a hard line break for a newline preceded by two or more spaces
///      (`.LineBreak`); the trailing spaces are dropped.
/// Everything else is accumulated and emitted as `.Text`.
///
/// `context` is forwarded to the recursive call that parses link text.
/// Errors from allocation, child parsing or `appendChild` are propagated.
fn parseInlines(self: *Parser, text: []const u8, parent_node: *Node, context: InlineContext) !void {
    var current_pos: usize = 0;
    // Start of the plain-text run that has not been emitted yet.
    var text_segment_start: usize = 0;

    while (current_pos < text.len) {
        const char = text[current_pos];
        var consumed = false;

        switch (char) {
            // 1. Escaped punctuation: emit the escaped character literally.
            '\\' => {
                if (current_pos + 1 < text.len and isMarkdownPunctuation(text[current_pos + 1])) {
                    try appendInlineText(self, parent_node, text[text_segment_start..current_pos]);
                    try appendInlineText(self, parent_node, text[current_pos + 1 .. current_pos + 2]);
                    current_pos += 2;
                    text_segment_start = current_pos;
                    consumed = true;
                }
            },

            // 2. Code span: the closing backtick run must match the opening length.
            '`' => {
                const tick_len = countLeadingChars(text[current_pos..], '`');
                var scan = current_pos + tick_len;
                var closing: ?usize = null;
                while (scan < text.len) {
                    if (text[scan] != '`') {
                        scan += 1;
                        continue;
                    }
                    const run_len = countLeadingChars(text[scan..], '`');
                    if (run_len == tick_len) {
                        closing = scan;
                        break;
                    }
                    scan += run_len;
                }

                if (closing) |end_pos| {
                    var span = text[current_pos + tick_len .. end_pos];
                    // Strip one space from each side when both sides have one
                    // and the span is not made of spaces only.
                    if (span.len > 1 and span[0] == ' ' and span[span.len - 1] == ' ') {
                        var only_spaces = true;
                        for (span[1 .. span.len - 1]) |c| {
                            if (c != ' ') {
                                only_spaces = false;
                                break;
                            }
                        }
                        if (!only_spaces) span = span[1 .. span.len - 1];
                    }

                    try appendInlineText(self, parent_node, text[text_segment_start..current_pos]);
                    var code_node = Node.init(self.allocator, .InlineCode);
                    errdefer code_node.deinit();
                    code_node.content = try self.allocator.dupe(u8, span);
                    try parent_node.appendChild(code_node);

                    current_pos = end_pos + tick_len;
                    text_segment_start = current_pos;
                    consumed = true;
                }
                // Without a closing run the backtick stays literal text.
            },

            // 3. Strong / emphasis / strikethrough. The doubled marker is tried
            //    first; '*' and '_' then fall back to the single marker. A
            //    single '~' is never a delimiter.
            '*', '_', '~' => {
                const doubled = current_pos + 1 < text.len and text[current_pos + 1] == char;
                if (doubled) {
                    consumed = try findAndParseEmphasisLike(
                        self,
                        text,
                        current_pos,
                        text[current_pos .. current_pos + 2],
                        parent_node,
                        &text_segment_start,
                        &current_pos,
                    );
                }
                if (!consumed and char != '~') {
                    consumed = try findAndParseEmphasisLike(
                        self,
                        text,
                        current_pos,
                        text[current_pos .. current_pos + 1],
                        parent_node,
                        &text_segment_start,
                        &current_pos,
                    );
                }
            },

            // 4. Inline links and images. Any `break :link` leaves the
            //    bracket (or '!') to be treated as literal text.
            '[', '!' => link: {
                const is_image = char == '!';
                if (is_image and !(current_pos + 1 < text.len and text[current_pos + 1] == '[')) break :link;
                const bracket_open = if (is_image) current_pos + 1 else current_pos;

                // Matching ']' for the link text / alt text, honouring
                // nesting and backslash escapes.
                var bracket_depth: usize = 1;
                var bracket_close: ?usize = null;
                var scan = bracket_open + 1;
                while (scan < text.len) {
                    const c = text[scan];
                    if (c == '\\' and scan + 1 < text.len) {
                        scan += 2;
                        continue;
                    }
                    if (c == '[') {
                        bracket_depth += 1;
                    } else if (c == ']') {
                        bracket_depth -= 1;
                        if (bracket_depth == 0) {
                            bracket_close = scan;
                            break;
                        }
                    }
                    scan += 1;
                }
                const label_end = bracket_close orelse break :link;
                const label_slice = text[bracket_open + 1 .. label_end];

                // TODO: Reference links [text][label], [label][], [label]
                // (requires looking the label up in `self.link_references`).
                const paren_open = label_end + 1;
                if (paren_open >= text.len or text[paren_open] != '(') break :link;

                // Matching ')' for the destination/title part.
                var paren_depth: usize = 1;
                var paren_close: ?usize = null;
                scan = paren_open + 1;
                while (scan < text.len) {
                    const c = text[scan];
                    if (c == '\\' and scan + 1 < text.len) {
                        scan += 2;
                        continue;
                    }
                    if (c == '(') {
                        paren_depth += 1;
                    } else if (c == ')') {
                        paren_depth -= 1;
                        if (paren_depth == 0) {
                            paren_close = scan;
                            break;
                        }
                    }
                    scan += 1;
                }
                const inner_end = paren_close orelse break :link;
                const inner = text[paren_open + 1 .. inner_end];

                // --- destination ---
                var cursor: usize = 0;
                while (cursor < inner.len and ascii.isWhitespace(inner[cursor])) : (cursor += 1) {}
                const dest_start = cursor;
                var dest_end = dest_start;
                var dest_slice: []const u8 = "";
                var angled = false;

                if (dest_start < inner.len and inner[dest_start] == '<') {
                    var gt = dest_start + 1;
                    while (gt < inner.len and inner[gt] != '>') {
                        if (inner[gt] == '\\' and gt + 1 < inner.len) gt += 1; // skip escaped char
                        gt += 1;
                    }
                    if (gt < inner.len) {
                        dest_slice = inner[dest_start + 1 .. gt];
                        dest_end = gt + 1;
                        angled = true;
                    }
                }
                if (!angled) {
                    // Bare destination (also used for a malformed `<...`):
                    // runs up to whitespace or a parenthesis.
                    dest_end = dest_start;
                    while (dest_end < inner.len and !ascii.isWhitespace(inner[dest_end])) {
                        if (inner[dest_end] == '(' or inner[dest_end] == ')') break;
                        if (inner[dest_end] == '\\' and dest_end + 1 < inner.len) dest_end += 1;
                        dest_end += 1;
                    }
                    dest_slice = inner[dest_start..dest_end];
                }

                // --- optional title ---
                cursor = dest_end;
                while (cursor < inner.len and ascii.isWhitespace(inner[cursor])) : (cursor += 1) {}
                var title_slice: ?[]const u8 = null;
                if (cursor < inner.len) {
                    const quote = inner[cursor];
                    if (quote == '"' or quote == '\'' or quote == '(') {
                        const closer: u8 = if (quote == '(') ')' else quote;
                        var title_end = cursor + 1;
                        while (title_end < inner.len) {
                            if (inner[title_end] == '\\' and title_end + 1 < inner.len) {
                                title_end += 2;
                                continue;
                            }
                            if (inner[title_end] == closer) break;
                            title_end += 1;
                        }
                        if (title_end < inner.len) {
                            title_slice = inner[cursor + 1 .. title_end];
                            cursor = title_end + 1;
                        }
                    }
                }
                while (cursor < inner.len and ascii.isWhitespace(inner[cursor])) : (cursor += 1) {}
                // Leftover characters mean the construct is malformed.
                if (cursor != inner.len) break :link;

                // Nothing has been allocated up to this point, so every
                // `break :link` above is leak-free. From here on the node owns
                // its strings (assumes Node.deinit releases url/title/alt_text).
                var link_node = Node.init(self.allocator, if (is_image) .Image else .Link);
                errdefer link_node.deinit();
                link_node.url = try self.allocator.dupe(u8, dest_slice); // TODO: unescape URL
                link_node.title = if (title_slice) |t| try self.allocator.dupe(u8, t) else null; // TODO: unescape title
                if (is_image) {
                    link_node.alt_text = try self.allocator.dupe(u8, label_slice); // TODO: unescape alt
                } else {
                    try self.parseInlines(label_slice, &link_node, context);
                }

                try appendInlineText(self, parent_node, text[text_segment_start..current_pos]);
                try parent_node.appendChild(link_node);

                current_pos = inner_end + 1;
                text_segment_start = current_pos;
                consumed = true;
            },

            // 5. Hard line break: newline preceded by at least two spaces.
            //    Paragraph text built by joining lines with spaces never
            //    contains '\n', so this only triggers for callers that pass
            //    raw newlines. Other newlines stay in the text run.
            '\n' => {
                if (current_pos >= 2 and text[current_pos - 1] == ' ' and text[current_pos - 2] == ' ') {
                    // Emit the pending text without its trailing spaces.
                    var segment_end = current_pos;
                    while (segment_end > text_segment_start and text[segment_end - 1] == ' ') : (segment_end -= 1) {}
                    try appendInlineText(self, parent_node, text[text_segment_start..segment_end]);

                    var br_node = Node.init(self.allocator, .LineBreak);
                    errdefer br_node.deinit();
                    try parent_node.appendChild(br_node);

                    current_pos += 1; // consume '\n'
                    text_segment_start = current_pos;
                    consumed = true;
                }
            },

            // TODO: Autolinks <http://foo.bar>
            // TODO: Raw HTML tags <a>...</a>
            else => {},
        }

        if (!consumed) current_pos += 1;
    }

    // Emit whatever plain text is left.
    try appendInlineText(self, parent_node, text[text_segment_start..]);
}
/// Tries to parse an emphasis-like span that opens with `marker_slice` at
/// `text[start_pos]`.
///
/// `marker_slice` selects the node type: a two-character `~~` gives
/// `.Strikethrough`, any other two-character marker gives `.Strong`, and a
/// one-character marker gives `.Emphasis`. The first later occurrence of
/// `marker_slice` that is not immediately adjacent to the opener closes the
/// span; backslash-escaped characters are skipped while searching.
///
/// On success the pending text `text[text_segment_start_ptr.* .. start_pos]`
/// is appended to `parent_node` as `.Text`, followed by the new node, and both
/// `text_segment_start_ptr.*` and `current_pos_ptr.*` are set to the position
/// just past the closing marker. The function then returns true.
///
/// On failure (no closing marker, or the span has no inline children) nothing
/// is appended, neither pointer is modified, and false is returned. Pending
/// text is therefore only emitted once the span is known to be valid, so the
/// caller never emits it a second time.
fn findAndParseEmphasisLike(self: *Parser, text: []const u8, start_pos: usize, marker_slice: []const u8, parent_node: *Node, text_segment_start_ptr: *usize, current_pos_ptr: *usize) !bool {
    const content_start = start_pos + marker_slice.len;
    var search_offset = content_start;

    while (search_offset < text.len) {
        // Skip escaped characters so `\*` cannot close a span.
        if (text[search_offset] == '\\' and search_offset + 1 < text.len) {
            search_offset += 2;
            continue;
        }

        if (!mem.startsWith(u8, text[search_offset..], marker_slice)) {
            search_offset += 1;
            continue;
        }

        // A closer directly after the opener (e.g. `****`) would give an
        // empty span; skip it and keep looking.
        if (search_offset == content_start) {
            search_offset += marker_slice.len;
            continue;
        }

        var emph_node = if (marker_slice.len != 2)
            Node.init(self.allocator, .Emphasis)
        else if (marker_slice[0] == '~')
            Node.init(self.allocator, .Strikethrough)
        else
            Node.init(self.allocator, .Strong);
        errdefer emph_node.deinit();

        try self.parseInlines(text[content_start..search_offset], &emph_node, .{});
        if (emph_node.children.items.len == 0) {
            // Nothing inside: leave the markers as literal text.
            emph_node.deinit();
            return false;
        }

        // Flush the text that precedes the opening marker.
        const pending = text[text_segment_start_ptr.*..start_pos];
        if (pending.len > 0) {
            var text_node = Node.init(self.allocator, .Text);
            errdefer text_node.deinit();
            text_node.content = try self.allocator.dupe(u8, pending);
            try parent_node.appendChild(text_node);
        }
        try parent_node.appendChild(emph_node);

        const resume_pos = search_offset + marker_slice.len;
        text_segment_start_ptr.* = resume_pos;
        current_pos_ptr.* = resume_pos;
        return true;
    }

    return false; // no closing marker
}
// --- Block helper predicates --- | |
/// Reports whether `line_content` (indentation already removed) is a thematic
/// break: at least three occurrences of one marker character (`*`, `-` or
/// `_`), optionally separated by spaces or tabs, and nothing else before the
/// line ending.
fn isThematicBreakStart(line_content: []const u8) bool {
    var marker_char: u8 = 0; // 0 = no marker seen yet
    var count: usize = 0;

    for (line_content) |char| {
        switch (char) {
            '*', '-', '_' => {
                if (marker_char == 0) {
                    marker_char = char;
                } else if (char != marker_char) {
                    return false; // mixed markers, e.g. "*-*"
                }
                count += 1;
            },
            ' ', '\t' => {},
            '\n', '\r' => break, // end of line
            else => return false,
        }
    }
    return count >= 3;
}
/// Reports whether `line_content` (indentation already removed) opens an ATX
/// heading: one to six `#` characters followed by a space, tab, line ending,
/// or the end of the slice.
///
/// Lines such as `#hashtag` or `####### seven` are not headings, so they must
/// not interrupt a paragraph; this matches what `tryParseAtxHeading` accepts.
fn isAtxHeadingStart(line_content: []const u8) bool {
    var hashes: usize = 0;
    while (hashes < line_content.len and line_content[hashes] == '#') : (hashes += 1) {}

    if (hashes == 0 or hashes > 6) return false;
    if (hashes == line_content.len) return true;

    return switch (line_content[hashes]) {
        ' ', '\t', '\n', '\r' => true,
        else => false,
    };
}
/// Reports whether `line_content` begins with three identical fence
/// characters, either backticks or tildes.
fn isFencedCodeBlockStart(line_content: []const u8) bool {
    if (line_content.len < 3) return false;

    const head = line_content[0..3];
    return switch (head[0]) {
        '`', '~' => head[1] == head[0] and head[2] == head[0],
        else => false,
    };
}
/// Classification returned by `isListItemStart`.
const ListItemType = enum { NotListItem, Unordered, Ordered };

/// Classifies `line_content` as the start of a list item.
///
/// Leading spaces are skipped first. A bullet (`*`, `-`, `+`) or an ordered
/// marker (one to nine digits followed by `.` or `)`) counts only when it is
/// followed by a space, tab or newline; a marker that ends the slice is not
/// recognised.
fn isListItemStart(line_content: []const u8) ListItemType {
    var i: usize = 0;
    while (i < line_content.len and line_content[i] == ' ') : (i += 1) {}
    if (i >= line_content.len) return .NotListItem;

    // Unordered: bullet + whitespace.
    switch (line_content[i]) {
        '*', '-', '+' => {
            if (i + 1 < line_content.len) {
                switch (line_content[i + 1]) {
                    ' ', '\t', '\n' => return .Unordered,
                    else => {},
                }
            }
        },
        else => {},
    }

    // Ordered: digits + '.' or ')' + whitespace.
    const num_start = i;
    while (i < line_content.len and ascii.isDigit(line_content[i])) : (i += 1) {}
    const digit_count = i - num_start;
    if (digit_count == 0 or i >= line_content.len) return .NotListItem;
    if (line_content[i] != '.' and line_content[i] != ')') return .NotListItem;
    if (i + 1 >= line_content.len) return .NotListItem;

    switch (line_content[i + 1]) {
        ' ', '\t', '\n' => {},
        else => return .NotListItem,
    }
    // CommonMark caps ordered list numbers at nine digits.
    return if (digit_count <= 9) .Ordered else .NotListItem;
}
/// Returns true when `line_content` is non-empty and its first byte is `>`.
fn isBlockquoteStart(line_content: []const u8) bool {
    return line_content.len != 0 and line_content[0] == '>';
}
/// Returns the setext heading level announced by `line` when it is an
/// underline: 1 for a run of `=`, 2 for a run of `-`, and 0 when the line is
/// not an underline.
///
/// The line must be indented by less than `TAB_STOP` columns and, after that
/// indent, consist of one uninterrupted run of the marker character followed
/// only by spaces or tabs up to the line terminator. Lines with whitespace
/// between markers (`= =`, `- - -`) are rejected; CommonMark does not treat
/// them as setext underlines.
fn getSetextUnderlineLevel(line: []const u8) u8 {
    const indent = getIndentWidth(line, TAB_STOP);
    if (indent >= TAB_STOP) return 0;
    const content = stripIndent(line, indent, TAB_STOP);
    if (content.len == 0) return 0;

    const marker = content[0];
    if (marker != '=' and marker != '-') return 0;

    // Consume the contiguous marker run; it is at least one byte long because
    // content[0] is the marker itself.
    var i: usize = 0;
    while (i < content.len and content[i] == marker) : (i += 1) {}

    // Only trailing whitespace may follow, up to the end of the line.
    while (i < content.len) : (i += 1) {
        switch (content[i]) {
            ' ', '\t' => {},
            '\n', '\r' => break,
            else => return 0,
        }
    }
    return if (marker == '=') 1 else 2;
}
// --- Complex Block Parsers (Blockquote, Lists) --- | |
// These often involve recursive parsing of their contents. | |
/// Tries to parse a blockquote starting at the current line and, on success,
/// appends a `.Blockquote` node to `parent_node`.
///
/// Lines beginning with `>` have the marker and one optional following
/// space/tab removed. Lines without a marker are accepted as lazy
/// continuations unless they are blank or look like a thematic break, ATX
/// heading, fenced code block, setext underline or list item. The collected
/// text is parsed by a nested `Parser` and the resulting document's children
/// are moved under the blockquote node.
///
/// Returns false, with the line position restored, when the current line does
/// not start a blockquote. Returns true when lines were consumed; if the
/// recursion limit is hit the consumed lines are dropped and no node is
/// produced. Allocation and nested-parse errors are propagated.
fn tryParseBlockquote(self: *Parser, parent_node: *Node) !bool {
    const initial_line_idx = self.current_line_idx;
    const line = self.currentLine() orelse return false;
    const indent = getIndentWidth(line, TAB_STOP);
    if (indent >= TAB_STOP) return false;
    if (!isBlockquoteStart(stripIndent(line, indent, TAB_STOP))) return false;

    var quote_node = Node.init(self.allocator, .Blockquote);
    errdefer quote_node.deinit();

    var lines_for_blockquote_content = ArrayList([]const u8).init(self.allocator);
    defer {
        for (lines_for_blockquote_content.items) |l| self.allocator.free(l);
        lines_for_blockquote_content.deinit();
    }

    while (self.currentLine()) |current_bq_line| {
        const current_bq_line_indent = getIndentWidth(current_bq_line, TAB_STOP);
        const stripped_line = stripIndent(current_bq_line, current_bq_line_indent, TAB_STOP);

        if (isBlockquoteStart(stripped_line)) {
            // Consume '>' and optional space
            var quoted = stripped_line[1..];
            if (quoted.len > 0 and (quoted[0] == ' ' or quoted[0] == '\t')) {
                quoted = quoted[1..];
            }
            const owned_line = try self.allocator.dupe(u8, quoted);
            errdefer self.allocator.free(owned_line); // not yet owned by the list if append fails
            try lines_for_blockquote_content.append(owned_line);
            self.advanceLine();
            continue;
        }

        // No marker: this can only be a lazy continuation line.
        // Blank line ends blockquote unless nested. This is simplified.
        if (isBlankLine(current_bq_line)) break;

        // Lines that would start another block terminate the blockquote.
        if (isThematicBreakStart(stripped_line) or
            isAtxHeadingStart(stripped_line) or
            isFencedCodeBlockStart(stripped_line) or
            getSetextUnderlineLevel(stripped_line) > 0)
        {
            break;
        }
        if (isListItemStart(stripped_line) != .NotListItem) break;

        // Lazy continuation: keep the original indent for sub-parsing.
        const owned_line = try self.allocator.dupe(u8, current_bq_line);
        errdefer self.allocator.free(owned_line);
        try lines_for_blockquote_content.append(owned_line);
        self.advanceLine();
    }

    if (lines_for_blockquote_content.items.len == 0) {
        quote_node.deinit();
        self.current_line_idx = initial_line_idx;
        return false;
    }

    if (self.recursion_depth >= MAX_RECURSION_DEPTH) {
        // TODO: How to handle this error? Maybe just parse as text.
        // For now the consumed lines are dropped; release the unused node so
        // it does not leak.
        dbgPrint("Max recursion depth reached in blockquote.\n", .{});
        quote_node.deinit();
        return true; // Consumed lines, but didn't produce valid content due to depth.
    }
    self.recursion_depth += 1;
    defer self.recursion_depth -= 1;

    // Reconstruct the text for the sub-parser. The collected lines are
    // concatenated as-is (dupe'd lines should carry their newlines from
    // splitLines).
    var sub_parser_text_buf = ArrayList(u8).init(self.allocator);
    defer sub_parser_text_buf.deinit();
    for (lines_for_blockquote_content.items) |l| {
        try sub_parser_text_buf.appendSlice(l);
    }

    if (sub_parser_text_buf.items.len > 0) {
        // The buffer stays owned by `sub_parser_text_buf`, which outlives the
        // sub-parser (its defer runs first), so nothing is leaked.
        // NOTE(review): assumes nodes returned by parseDocument() own their
        // text rather than slicing into the parser input - confirm.
        var sub_parser = try Parser.init(self.allocator, sub_parser_text_buf.items);
        defer sub_parser.deinit();
        // Carry the nesting depth into the nested parser so the
        // MAX_RECURSION_DEPTH guard also applies to quotes inside quotes.
        sub_parser.recursion_depth = self.recursion_depth;

        // Copy link references
        var ref_iter = self.link_references.iterator();
        while (ref_iter.next()) |entry| {
            try sub_parser.link_references.put(
                try self.allocator.dupe(u8, entry.key_ptr.*),
                LinkRef{
                    .url = try self.allocator.dupe(u8, entry.value_ptr.url),
                    .title = if (entry.value_ptr.title) |t| try self.allocator.dupe(u8, t) else null,
                },
            );
        }

        var sub_document_node = try sub_parser.parseDocument(); // This creates a Document node
        // Transfer children from sub_document_node to quote_node
        for (sub_document_node.children.items) |child_node| {
            try quote_node.appendChild(child_node);
        }
        // The children now belong to quote_node. Empty the list (keeping its
        // backing allocation valid) so deinit() releases only the list itself.
        sub_document_node.children.clearRetainingCapacity();
        sub_document_node.deinit();
    }

    try parent_node.appendChild(quote_node);
    return true;
}
// tryParseUnorderedList and tryParseOrderedList are very complex due to nesting, | |
// lazy continuation, and determining "tight" vs "loose". | |
// This is a simplified version. | |
/// Attempts to parse a bullet list at the current line by delegating to
/// `tryParseList` with `.Unordered`; the result and any error are passed
/// through unchanged.
fn tryParseUnorderedList(self: *Parser, parent_node: *Node, current_block_indent: usize) !bool {
    const parsed = try self.tryParseList(.Unordered, parent_node, current_block_indent);
    return parsed;
}
/// Attempts to parse a numbered list at the current line by delegating to
/// `tryParseList` with `.Ordered`; the result and any error are passed
/// through unchanged.
fn tryParseOrderedList(self: *Parser, parent_node: *Node, current_block_indent: usize) !bool {
    const parsed = try self.tryParseList(.Ordered, parent_node, current_block_indent);
    return parsed;
}
/// Tries to parse a list of kind `list_type_check` starting at the current
/// line and, on success, appends the list node to `parent_node`.
///
/// The current line must carry a marker of the requested kind and be indented
/// by at least `current_block_indent` columns; otherwise false is returned
/// without consuming anything. Each item's lines (marker line plus
/// sufficiently indented continuation lines) are joined with single spaces
/// and parsed as inline content; nested blocks are not parsed recursively.
///
/// Tight/loose: the list is marked loose when a blank line separates two
/// sibling items or sits inside an item's content. A blank line that merely
/// ends the list does not make it loose.
///
/// Returns true when at least one item was parsed. Allocation and inline
/// parsing errors are propagated.
fn tryParseList(self: *Parser, list_type_check: ListItemType, parent_node: *Node, current_block_indent: usize) !bool {
    const initial_line_idx = self.current_line_idx;
    const line = self.currentLine() orelse return false;
    var item_indent = getIndentWidth(line, TAB_STOP);
    const content_after_indent = stripIndent(line, item_indent, TAB_STOP);
    const current_item_type_info = self.getListItemInfo(content_after_indent);
    if (current_item_type_info.item_type != list_type_check) return false;
    if (item_indent < current_block_indent) return false; // Must be at least same indent level

    var list_node = Node.init(self.allocator, if (list_type_check == .Unordered) .UnorderedList else .OrderedList);
    errdefer list_node.deinit();
    if (list_type_check == .Ordered) {
        list_node.start_number = current_item_type_info.number;
    }
    var is_tight = true; // Assume tight initially

    // Loop for list items
    while (self.currentLine()) |item_line_loop| {
        const loop_item_indent = getIndentWidth(item_line_loop, TAB_STOP);
        const loop_content_after_indent = stripIndent(item_line_loop, loop_item_indent, TAB_STOP);
        const loop_item_info = self.getListItemInfo(loop_content_after_indent);

        if (loop_item_info.item_type != list_type_check) {
            // Anything that is not a marker line of this list ends it, except
            // a blank line that separates two sibling items.
            if (!isBlankLine(item_line_loop)) break;
            if (list_node.children.items.len == 0) break;
            const peeked_line = self.peekLine(1) orelse break;
            const peek_indent = getIndentWidth(peeked_line, TAB_STOP);
            const peek_info = self.getListItemInfo(stripIndent(peeked_line, peek_indent, TAB_STOP));
            if (peek_info.item_type != list_type_check or peek_indent < item_indent) break;
            // Blank line between siblings: the list is loose and continues.
            is_tight = false;
            self.advanceLine(); // consume the separating blank line
            continue;
        }

        // A marker of the right kind that is dedented below the list ends it.
        if (loop_item_indent < item_indent) break;
        // The current item dictates the base indent for its content.
        item_indent = loop_item_indent;

        // Parse this list item
        var list_item_node = Node.init(self.allocator, .ListItem);
        errdefer list_item_node.deinit();
        if (loop_item_info.is_task_list) {
            list_item_node.is_task_list_item = true;
            list_item_node.is_task_list_item_checked = loop_item_info.is_task_checked;
        }

        // Collect lines for this single list item
        var item_content_lines = ArrayList([]const u8).init(self.allocator);
        defer {
            for (item_content_lines.items) |l| self.allocator.free(l);
            item_content_lines.deinit();
        }

        // First line of item content (after marker)
        const first_content_part = loop_content_after_indent[loop_item_info.marker_len..];
        try item_content_lines.append(try self.allocator.dupe(u8, mem.trimLeft(u8, first_content_part, " \t")));
        self.advanceLine();

        // Subsequent lines for the item.
        // CommonMark's continuation indent rules are complex; this uses
        // simplified indent heuristics.
        while (self.currentLine()) |continuation_line| {
            const cont_indent = getIndentWidth(continuation_line, TAB_STOP);

            if (isBlankLine(continuation_line)) {
                // EOF after the blank: it only terminates the list.
                const peek_cont = self.peekLine(1) orelse break;
                const peek_cont_indent = getIndentWidth(peek_cont, TAB_STOP);
                const peek_cont_info = self.getListItemInfo(stripIndent(peek_cont, peek_cont_indent, TAB_STOP));
                if (peek_cont_info.item_type == list_type_check and peek_cont_indent >= item_indent) {
                    // Blank line between this item and the next sibling.
                    is_tight = false;
                    break; // End current item's content.
                }
                if (peek_cont_indent < item_indent + 2) { // Arbitrary continuation indent threshold
                    // What follows is not part of this item, so the blank
                    // line just ends it and does not loosen the list.
                    break;
                }
                // Otherwise, this blank line is part of current item's content.
                try item_content_lines.append(try self.allocator.dupe(u8, "\n")); // Represent blank line
                is_tight = false; // Blank line within an item's content makes list loose.
                self.advanceLine();
                continue;
            }

            // If this line starts a new list item of the same type/level, current item ends.
            const next_item_info = self.getListItemInfo(stripIndent(continuation_line, cont_indent, TAB_STOP));
            if (next_item_info.item_type == list_type_check and cont_indent == item_indent) {
                break; // Start of a new sibling item
            }
            // Heuristic for continuation indent.
            if (cont_indent < item_indent or cont_indent < current_block_indent + 2) {
                break; // Not indented enough to be a continuation.
            }

            // Strip at most the marker width plus one space (approximation).
            const effective_strip_indent = @min(cont_indent, item_indent + loop_item_info.marker_len + 1);
            try item_content_lines.append(try self.allocator.dupe(u8, stripIndent(continuation_line, effective_strip_indent, TAB_STOP)));
            self.advanceLine();
        }

        // For simplicity, treat item content as single paragraph with inlines:
        // trimmed lines joined by one space.
        var item_full_content_buf = ArrayList(u8).init(self.allocator);
        defer item_full_content_buf.deinit();
        for (item_content_lines.items, 0..) |content_line, idx| {
            try item_full_content_buf.appendSlice(trimLine(content_line));
            if (idx < item_content_lines.items.len - 1) {
                try item_full_content_buf.append(' ');
            }
        }
        if (item_full_content_buf.items.len > 0) {
            try self.parseInlines(item_full_content_buf.items, &list_item_node, .{});
        }

        try list_node.appendChild(list_item_node);
    }

    if (list_node.children.items.len == 0) {
        list_node.deinit();
        self.current_line_idx = initial_line_idx;
        return false;
    }
    list_node.tight = is_tight;
    try parent_node.appendChild(list_node);
    return true;
}
/// Result of inspecting the start of a line for a list-item marker, as
/// produced by `getListItemInfo`.
const ListItemInfo = struct {
    /// Kind of marker found; `.NotListItem` when there is none.
    item_type: ListItemType,
    /// Number of bytes at the start of the inspected slice that belong to the
    /// marker. For task items this extends through the closing `]`.
    marker_len: usize,
    /// Parsed number of an ordered marker; null for every other kind.
    number: ?u64, // For ordered lists
    /// True when the item carries a `[ ]` / `[x]` checkbox.
    is_task_list: bool = false,
    /// True when that checkbox holds `x` or `X`.
    is_task_checked: bool = false,
};
/// Inspects the start of `line_content` (expected to be indent-stripped) and
/// describes the list marker found there, if any.
///
/// Recognised markers:
///   * `*`, `-` or `+` followed by a space, tab or `\n` (unordered). When the
///     content after the marker starts with `[ ]`, `[x]` or `[X]` followed by
///     whitespace or the end of the slice, the item is reported as a task
///     item and `marker_len` extends through the `]`.
///   * 1 to 9 digits followed by `.` or `)` and then a space, tab or `\n`
///     (ordered); `marker_len` covers the digits and the delimiter.
///
/// Anything else yields `.NotListItem` with `marker_len` 0. `self` is unused.
fn getListItemInfo(self: *Parser, line_content: []const u8) ListItemInfo {
    _ = self;
    const not_list_item = ListItemInfo{ .item_type = .NotListItem, .marker_len = 0, .number = null };
    var i: usize = 0;

    // Unordered: *, -, +
    if (i < line_content.len and (line_content[i] == '*' or line_content[i] == '-' or line_content[i] == '+')) {
        i += 1;
        if (i < line_content.len and (line_content[i] == ' ' or line_content[i] == '\t' or line_content[i] == '\n')) {
            // Check for task list item: `* [ ] ` or `* [x] `
            var checkbox_start = i + 1; // Skip the space/tab
            while (checkbox_start < line_content.len and
                (line_content[checkbox_start] == ' ' or line_content[checkbox_start] == '\t')) : (checkbox_start += 1)
            {}
            if (checkbox_start + 2 < line_content.len and line_content[checkbox_start] == '[') {
                const cb_content = line_content[checkbox_start + 1];
                const is_checkbox = (cb_content == ' ' or cb_content == 'x' or cb_content == 'X') and
                    line_content[checkbox_start + 2] == ']';
                if (is_checkbox) {
                    // Potential task list. Must be followed by space or end of line.
                    const after_checkbox = checkbox_start + 3;
                    if (after_checkbox == line_content.len or
                        line_content[after_checkbox] == ' ' or
                        line_content[after_checkbox] == '\t' or
                        line_content[after_checkbox] == '\n')
                    {
                        return .{
                            .item_type = .Unordered,
                            .marker_len = after_checkbox, // Marker includes `* [ ]`
                            .number = null,
                            .is_task_list = true,
                            .is_task_checked = (cb_content == 'x' or cb_content == 'X'),
                        };
                    }
                }
            }
            return .{ .item_type = .Unordered, .marker_len = i, .number = null };
        }
        i = 0; // Reset if not valid marker
    }

    // Ordered: 1. 1)
    const num_start = i;
    while (i < line_content.len and ascii.isDigit(line_content[i])) : (i += 1) {}
    const digit_count = i - num_start;
    // Enforce the 9-digit limit before parsing: the value then always fits
    // in a u64 and an over-long digit run is rejected quietly.
    if (digit_count == 0 or digit_count > 9) return not_list_item;
    if (i >= line_content.len or (line_content[i] != '.' and line_content[i] != ')')) return not_list_item;

    const number = std.fmt.parseUnsigned(u64, line_content[num_start..i], 10) catch return not_list_item;
    i += 1; // consume . or )
    if (i < line_content.len and (line_content[i] == ' ' or line_content[i] == '\t' or line_content[i] == '\n')) {
        return .{ .item_type = .Ordered, .marker_len = i, .number = number };
    }
    return not_list_item;
}
/// Tries to parse a raw HTML block starting at the current line and, on
/// success, appends an `.HtmlBlock` node holding the raw lines to
/// `parent_node`.
///
/// This is a very simplified HTML block parser (CommonMark defines 7 kinds).
/// The line must be indented by less than `TAB_STOP` columns and open with
/// `<name` where `name` matches, case-insensitively, one of the known
/// block-level tags. Lines are then collected until a blank line, which is
/// included and consumed. For `pre`, `script`, `style` and `textarea` they
/// also stop at a line containing the matching closing tag, compared
/// case-insensitively.
///
/// Returns false without consuming anything when the line is not such a tag.
/// Allocation errors are propagated.
fn tryParseHtmlBlock(self: *Parser, parent_node: *Node) !bool {
    const initial_line_idx = self.current_line_idx;
    const line = self.currentLine() orelse return false;
    const indent = getIndentWidth(line, TAB_STOP);
    if (indent >= TAB_STOP) return false;
    const content = mem.trimLeft(u8, stripIndent(line, indent, TAB_STOP), " \t");
    if (content.len == 0 or content[0] != '<') return false;

    // Tags whose content is literal until the closing tag (CM type 1).
    const raw_text_tags = [_][]const u8{ "pre", "script", "style", "textarea" };
    // Common block tags that CM considers HTML Block Type 1 or 6.
    const block_tags = [_][]const u8{
        "pre",      "script",   "style",    "textarea",
        "address",  "article",  "aside",    "base",       "basefont", "blockquote", "body",
        "caption",  "center",   "col",      "colgroup",   "dd",       "details",    "dialog",
        "dir",      "div",      "dl",       "dt",         "fieldset", "figcaption", "figure",
        "footer",   "form",     "frame",    "frameset",   "h1",       "h2",         "h3",
        "h4",       "h5",       "h6",       "head",       "header",   "hr",         "html",
        "iframe",   "legend",   "li",       "link",       "main",     "menu",       "menuitem",
        "nav",      "noframes", "ol",       "optgroup",   "option",   "p",          "param",
        "section",  "source",   "summary",  "table",      "tbody",    "td",         "tfoot",
        "th",       "thead",    "title",    "tr",         "track",    "ul",
    };

    var tag_name_end: usize = 1;
    while (tag_name_end < content.len and ascii.isAlphanumeric(content[tag_name_end])) : (tag_name_end += 1) {}
    if (tag_name_end == 1) return false; // No tag name like `<>` or `</`
    const tag_name = content[1..tag_name_end];

    var is_known_block_tag = false;
    for (block_tags) |bt| {
        if (ascii.eqlIgnoreCase(bt, tag_name)) { // Case-insensitive tag match
            is_known_block_tag = true;
            break;
        }
    }
    if (!is_known_block_tag) return false; // Not a recognized block-level HTML tag start

    var is_raw_text_tag = false;
    for (raw_text_tags) |rt| {
        if (ascii.eqlIgnoreCase(rt, tag_name)) {
            is_raw_text_tag = true;
            break;
        }
    }

    // Build the closing tag once; it is only needed for raw-text tags.
    var closing_tag_buf = ArrayList(u8).init(self.allocator);
    defer closing_tag_buf.deinit();
    if (is_raw_text_tag) {
        try closing_tag_buf.appendSlice("</");
        try closing_tag_buf.appendSlice(tag_name);
        try closing_tag_buf.append('>');
    }

    var html_content_buf = ArrayList(u8).init(self.allocator);
    defer html_content_buf.deinit();

    var current_html_line_idx = initial_line_idx;
    while (current_html_line_idx < self.input_lines.items.len) {
        const current_html_line = self.input_lines.items[current_html_line_idx];
        const is_first_line = current_html_line_idx == initial_line_idx;
        try html_content_buf.appendSlice(current_html_line); // Includes newline
        current_html_line_idx += 1; // every inspected line is consumed

        // Simplistic: end on a blank line ...
        if (!is_first_line and isBlankLine(current_html_line)) break;
        // ... or, for raw-text tags, on the line holding the closing tag.
        if (is_raw_text_tag and ascii.indexOfIgnoreCase(current_html_line, closing_tag_buf.items) != null) break;
    }
    if (html_content_buf.items.len == 0) return false;

    var html_node = Node.init(self.allocator, .HtmlBlock);
    errdefer html_node.deinit();
    // Once assigned, the content belongs to the node. No separate errdefer
    // frees it, because that would free the same slice twice on error.
    // NOTE(review): assumes Node.deinit() releases `content` - confirm.
    html_node.content = try html_content_buf.toOwnedSlice();
    try parent_node.appendChild(html_node);
    self.current_line_idx = current_html_line_idx; // Advance main parser state
    return true;
}
}; // End Parser struct | |
/// Returns true for the ASCII punctuation bytes: `!` through `/`, `:` through
/// `@`, `[` through the backtick, and `{` through `~`. Every other byte
/// yields false.
fn isMarkdownPunctuation(char: u8) bool {
    return switch (char) {
        '!'...'/', ':'...'@', '['...'`', '{'...'~' => true,
        else => false,
    };
}
/// Parses `markdown_text` with a temporary `Parser` built on `allocator` and
/// returns the document node produced by `parseDocument`. The parser is
/// released before returning; errors from `Parser.init` and `parseDocument`
/// are propagated.
pub fn parseMarkdown(allocator: Allocator, markdown_text: []const u8) !Node {
    var parser = try Parser.init(allocator, markdown_text);
    defer parser.deinit();

    const document = try parser.parseDocument();
    return document;
}
/// Demo entry point: parses a built-in Markdown sample and prints the input
/// followed by the resulting AST to stderr via `std.debug.print`.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    // A multiline string literal is already a compile-time constant, so no
    // comptime buffer or stream is needed to build the sample.
    // NOTE(review): sample lines that look like they should be indented
    // (indented code block, nested list items, list continuation) carry only
    // a single leading space, and the image example has no image syntax -
    // confirm against the intended sample text.
    const markdown_input: []const u8 =
        \\# Heading 1
        \\
        \\This is a paragraph with *italic* and **bold** text.
        \\And `inline code`. Here's a [link](http://example.com "Example").
        \\And an image .
        \\
        \\> Blockquote line 1
        \\> Blockquote line 2
        \\> > Nested blockquote
        \\> Back to first level.
        \\
        \\```javascript
        \\function hello() {
        \\ console.log("Hello, Fenced Code!");
        \\}
        \\```
        \\
        \\ function indentedCode() {
        \\ return true;
        \\ }
        \\
        \\---
        \\
        \\* Unordered item 1
        \\* Unordered item 2
        \\ * Nested item 2.1 (Tight list handling is simplified)
        \\ * Deeper Nested 2.1.1
        \\* Unordered item 3
        \\
        \\1. Ordered item 1
        \\2. Ordered item 2
        \\ With a continuation.
        \\300. Ordered item 3 (starts at 300)
        \\
        \\A paragraph that will become
        \\=============================
        \\A Setext L1 Heading.
        \\
        \\Another para for L2
        \\---------------------
        \\
        \\Strikethrough ~~deleted text~~.
        \\
        \\[refdef]: /url/ "My Ref Title"
        \\This is a ref [link][refdef]. And [refdef][]. And [implicit refdef].
        \\
        \\<p>This is an HTML block.</p>
        \\<div>
        \\ Another HTML block line.
        \\</div>
        \\
        \\* [ ] Unchecked task
        \\* [x] Checked task
        \\
    ;

    std.debug.print("---MARKDOWN INPUT---\n{s}\n---END INPUT---\n\n", .{markdown_input});
    const ast_root = try parseMarkdown(allocator, markdown_input);
    defer ast_root.deinit();
    std.debug.print("---AST OUTPUT---\n", .{});
    ast_root.print(0);
    std.debug.print("---END AST OUTPUT---\n", .{});
}
// Minimal test runner, can be expanded. | |
// A level-1 ATX heading followed by a paragraph yields exactly two block
// children, each wrapping a single text node.
test "basic heading and paragraph" {
    const testing = std.testing;
    var root = try parseMarkdown(testing.allocator, "# Test\n\nA paragraph.");
    defer root.deinit();

    try testing.expectEqual(root.type, NodeType.Document);
    try testing.expectEqual(root.children.items.len, 2);

    const heading = root.children.items[0];
    try testing.expectEqual(heading.type, NodeType.Heading);
    try testing.expectEqual(heading.level.?, 1);
    try testing.expectEqual(heading.children.items.len, 1); // Text node "Test"
    try testing.expectEqualSlices(u8, heading.children.items[0].content.?, "Test");

    const para = root.children.items[1];
    try testing.expectEqual(para.type, NodeType.Paragraph);
    try testing.expectEqual(para.children.items.len, 1); // Text node "A paragraph."
    try testing.expectEqualSlices(u8, para.children.items[0].content.?, "A paragraph.");
}
// `**bold** *italic*` parses into one paragraph holding a Strong node, a
// single-space Text node and an Emphasis node, in that order.
test "inline strong and emphasis" {
    const test_allocator = std.testing.allocator;
    const md = "**bold** *italic*";
    var root = try parseMarkdown(test_allocator, md);
    defer root.deinit();

    try std.testing.expectEqual(root.type, NodeType.Document);
    try std.testing.expectEqual(root.children.items.len, 1); // Paragraph
    const para = root.children.items[0];
    try std.testing.expectEqual(para.children.items.len, 3); // Strong, Text(" "), Emphasis

    // Text is compared byte-wise with expectEqualSlices: expectEqual on
    // slices compares pointer and length, which never matches a literal.
    const strong_node = para.children.items[0];
    try std.testing.expectEqual(strong_node.type, NodeType.Strong);
    try std.testing.expectEqualSlices(u8, strong_node.children.items[0].content.?, "bold");

    const space_node = para.children.items[1];
    try std.testing.expectEqual(space_node.type, NodeType.Text);
    try std.testing.expectEqualSlices(u8, space_node.content.?, " ");

    const em_node = para.children.items[2];
    try std.testing.expectEqual(em_node.type, NodeType.Emphasis);
    try std.testing.expectEqualSlices(u8, em_node.children.items[0].content.?, "italic");
}
// A fenced block with an info string yields a single FencedCodeBlock node
// carrying the language and the code text including its trailing newline.
test "fenced code block" {
    const testing = std.testing;
    var root = try parseMarkdown(testing.allocator, "```rust\nlet x = 10;\n```");
    defer root.deinit();

    try testing.expectEqual(root.children.items.len, 1);

    const fcb = root.children.items[0];
    try testing.expectEqual(fcb.type, NodeType.FencedCodeBlock);
    try testing.expectEqualSlices(u8, fcb.info_string.?, "rust");
    try testing.expectEqualSlices(u8, fcb.content.?, "let x = 10;\n");
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment