Last active
May 9, 2021 05:08
-
-
Save andymatuschak/35be2a2041eda6773347a61ce75cb641 to your computer and use it in GitHub Desktop.
parsing SRS prompts from Markdown
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import mdast from "mdast"; | |
import remarkParse from "remark-parse"; | |
import remarkStringify from "remark-stringify"; | |
import unified from "unified"; | |
import unist from "unist"; | |
import { clozeNodeType, ClozePromptNode } from "./index"; | |
// TODO: don't match clozes inside code and html blocks
/**
 * Matches a cloze deletion written as `{...}` at the very start of the text
 * it is applied to. Capture group 1 holds the text between the braces.
 *
 * The quantifier is lazy, so the match ends at the first closing brace, and
 * at least one character is required between the braces (`{}` never matches).
 * Because `.` does not match line terminators, the cloze must fit on one line.
 */
const clozeRegexp = /^{(.+?)}/;
/**
 * unified plugin that adds cloze deletions (`{...}`) to both halves of the
 * processor it is attached to:
 *
 * - the parser gains an inline tokenizer named `clozePrompt`, which turns
 *   `{...}` into a `ClozePromptNode` whose children are the inline-tokenized
 *   contents of the braces;
 * - the compiler gains a visitor that writes such a node back out via
 *   `clozePromptCompiler`.
 */
export default function clozePlugin(this: unified.Processor) {
  type Point = { line: number; column: number; offset: number };

  /**
   * Inline tokenizer: consumes a cloze at the start of `value`, or returns
   * `undefined` without consuming anything when `value` does not start with one.
   */
  function clozeTokenizer(
    this: remarkParse.Parser & {
      tokenizeInline: (content: string, now: Point) => mdast.PhrasingContent[];
    },
    eat: remarkParse.Eat & { now: () => Point },
    value: string
  ) {
    const found = clozeRegexp.exec(value);
    if (!found) {
      return undefined;
    }
    const [wholeCloze, innerText] = found;

    // The contents begin one character after the opening brace, so the
    // position handed to the nested tokenizer is shifted by one.
    const innerStart = eat.now();
    innerStart.column += 1;
    innerStart.offset += 1;

    const clozeNode: ClozePromptNode = {
      type: clozeNodeType,
      children: this.tokenizeInline(innerText, innerStart)
    };
    return eat(wholeCloze)(clozeNode);
  }

  // Reports where the next cloze could begin: the next "{" at or after `fromIndex`.
  clozeTokenizer.locator = (value: string, fromIndex: number) =>
    value.indexOf("{", fromIndex);

  // Register the tokenizer and place it directly before "text" in the inline
  // method order (presumably so it gets a chance before the plain-text tokenizer).
  const parserProto = this.Parser.prototype as remarkParse.Parser;
  parserProto.inlineTokenizers.clozePrompt = clozeTokenizer as remarkParse.Tokenizer;
  const { inlineMethods } = parserProto;
  inlineMethods.splice(inlineMethods.indexOf("text"), 0, "clozePrompt");

  // Register the serializer for cloze nodes.
  const compilerProto = this.Compiler.prototype as remarkStringify.Compiler;
  compilerProto.visitors[clozeNodeType] = clozePromptCompiler as (
    node: unist.Node
  ) => string;
}
/**
 * Compiler visitor for cloze nodes: serializes the node's children with the
 * compiler's `all` method, concatenates the pieces, and wraps the result in
 * braces.
 *
 * @param node - The cloze node to serialize.
 * @returns The serialized children surrounded by `{` and `}`.
 */
function clozePromptCompiler(
  this: remarkStringify.Compiler & {
    all: (node: unist.Node) => string[];
  },
  node: ClozePromptNode
): string {
  const serializedChildren = this.all(node);
  return "{" + serializedChildren.join("") + "}";
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import unist from "unist"; | |
import mdast from "mdast"; | |
import parents, { NodeWithParent } from "unist-util-parents"; | |
import { selectAll } from "unist-util-select"; | |
import { backlinksNodeType } from "../backlinksPlugin"; | |
import { JsonMap } from "../util/JSONTypes"; | |
/** `type` tag carried by cloze nodes in the syntax tree. */
export const clozeNodeType = "incremental-thinking-cloze";

/** Syntax-tree node for one cloze deletion; `children` is its inline content. */
export interface ClozePromptNode extends unist.Node {
  type: typeof clozeNodeType;
  children: mdast.PhrasingContent[];
}

/** `type` tag carried by extracted cloze prompts. */
export const clozePromptType = "cloze";

/** An extracted cloze prompt, represented by the block it was found in. */
export interface ClozePrompt extends JsonMap {
  type: typeof clozePromptType;
  block: mdast.BlockContent & JsonMap; // Except note that PhrasingContent can include type ClozePromptNode.
}

/** `type` tag carried by question/answer nodes in the syntax tree. */
export const qaPromptNodeType = "incremental-thinking-QA";

/** Syntax-tree node holding the question and the answer of one Q/A prompt. */
export interface QAPromptNode extends unist.Node {
  type: typeof qaPromptNodeType;
  question: mdast.Parent;
  answer: mdast.Parent;
}

/** `type` tag carried by extracted question/answer prompts. */
export const qaPromptType = "qaPrompt";

/** An extracted question/answer prompt. */
export interface QAPrompt extends JsonMap {
  type: typeof qaPromptType;
  question: mdast.Parent & JsonMap;
  answer: mdast.Parent & JsonMap;
}

/** Any prompt that `findAllPrompts` can return. */
export type Prompt = ClozePrompt | QAPrompt;
/**
 * Collects every prompt found in `tree`.
 *
 * Cloze prompts: each cloze node is mapped to its nearest ancestor whose type
 * is a block type (see `isBlockContent`); every such block is reported once,
 * no matter how many cloze nodes it contains. Q/A prompts: one per Q/A node.
 * Nodes for which `promptNodeHasUnsupportedParent` is true are ignored.
 *
 * @param tree - Root of the syntax tree to search.
 * @returns All cloze prompts followed by all Q/A prompts.
 */
export function findAllPrompts(tree: unist.Node): Prompt[] {
  const annotatedTree = parents(tree);

  const clozeNodes = selectAll(clozeNodeType, annotatedTree) as NodeWithParent[];
  const clozePrompts: ClozePrompt[] = [];
  const seenBlocks = new Set<mdast.BlockContent>();
  for (const clozeNode of clozeNodes) {
    // Climb until we reach the enclosing block, or run out of ancestors.
    let enclosing: NodeWithParent | null = clozeNode.parent;
    while (enclosing && !isBlockContent(enclosing)) {
      enclosing = enclosing.parent;
    }
    if (
      !enclosing ||
      promptNodeHasUnsupportedParent(clozeNode) ||
      seenBlocks.has(enclosing)
    ) {
      continue;
    }
    seenBlocks.add(enclosing);
    clozePrompts.push({
      type: "cloze",
      block: enclosing as mdast.BlockContent & JsonMap
    });
  }

  const qaPrompts: QAPrompt[] = [];
  for (const candidate of selectAll(qaPromptNodeType, annotatedTree)) {
    if (promptNodeHasUnsupportedParent(candidate as NodeWithParent)) {
      continue;
    }
    const qaNode = candidate as QAPromptNode;
    qaPrompts.push({
      type: "qaPrompt",
      question: qaNode.question as mdast.Parent & JsonMap,
      answer: qaNode.answer as mdast.Parent & JsonMap
    });
  }

  return [...clozePrompts, ...qaPrompts];
}
/**
 * Lists the cloze nodes selected from a cloze prompt's block.
 *
 * @param clozePrompt - The prompt whose `block` is searched.
 * @returns Every node of type `clozeNodeType` that `selectAll` finds in the block.
 */
export function getClozeNodesInClozePrompt(
  clozePrompt: ClozePrompt
): ClozePromptNode[] {
  const { block } = clozePrompt;
  const found = selectAll(clozeNodeType, block);
  return found as ClozePromptNode[];
}
/**
 * Tells whether a prompt node sits somewhere inside a backlinks node.
 *
 * @param promptNode - The node to check; only its ancestors are inspected,
 *   not the node itself.
 * @returns `true` when any ancestor has the type `backlinksNodeType`.
 */
function promptNodeHasUnsupportedParent(promptNode: NodeWithParent): boolean {
  for (
    let ancestor = promptNode.parent;
    ancestor;
    ancestor = ancestor.parent
  ) {
    if (ancestor.type === backlinksNodeType) {
      return true;
    }
  }
  return false;
}
/** Node types that `isBlockContent` accepts as block content. */
const blockTypes = new Set<string>([
  "paragraph",
  "heading",
  "thematicBreak",
  "blockquote",
  "list",
  "table",
  "html",
  "code"
]);

/**
 * Type guard: is `node` block content? The decision is based solely on the
 * node's `type` being listed in `blockTypes`.
 */
function isBlockContent(node: unist.Node): node is mdast.BlockContent {
  const { type } = node;
  return blockTypes.has(type);
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import mdast from "mdast"; | |
import remarkStringify from "remark-stringify"; | |
import unified from "unified"; | |
import unist from "unist"; | |
import parents, { NodeWithParent } from "unist-util-parents"; | |
import { selectAll } from "unist-util-select"; | |
import { QAPromptNode, qaPromptNodeType } from "./index"; | |
// TODO: don't match QA prompts inside code and html blocks | |
/**
 * unified plugin for question/answer prompts.
 *
 * Registers `qaPromptCompiler` as the compiler visitor for Q/A nodes and
 * returns `extractQAPromptNodes`, the function that rewrites a tree so that
 * Q/A paragraphs become Q/A nodes (presumably run by unified as this
 * plugin's transformer).
 */
export default function qaPromptPlugin(this: unified.Processor) {
  const { visitors } = this.Compiler.prototype as remarkStringify.Compiler;
  visitors[qaPromptNodeType] = qaPromptCompiler as (
    node: unist.Node
  ) => string;
  return extractQAPromptNodes;
}
/**
 * Compiler visitor for Q/A nodes. Serialization is not implemented yet, so
 * calling it always fails.
 *
 * @param node - The Q/A node that would be serialized (currently unused).
 * @throws Error with the message "Unimplemented", unconditionally.
 */
function qaPromptCompiler(
  this: remarkStringify.Compiler & {
    all: (node: unist.Node) => string[];
  },
  node: QAPromptNode
): string {
  const notImplemented = new Error("Unimplemented");
  throw notImplemented;
}
/** Text that opens the question of a Q/A prompt. */
const questionPrefix = "Q. ";
/** Text that opens the answer of a Q/A prompt. */
const answerPrefix = "A. ";
// The prefix contains ".", which is a wildcard inside a regular expression.
// Escape it so that only the literal prefix is accepted; otherwise a line
// such as "At home" or "An apple" would be mistaken for the start of an answer.
const escapedAnswerPrefix = answerPrefix.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
/**
 * Finds the point inside a single paragraph where the answer begins: a line
 * break immediately followed by the answer prefix. The match always spans
 * `answerPrefix.length + 1` characters (the line break plus the prefix).
 */
const answerSplitRegexp = new RegExp(`\n${escapedAnswerPrefix}`, "m");
/**
 * Rewrites `node` in place so that question/answer prompts written as plain
 * paragraphs become Q/A nodes, and returns `node`.
 *
 * Two spellings are recognized:
 *
 * 1. Two adjacent paragraphs: the first consists of a single text node that
 *    starts with the question prefix, the second starts with the answer
 *    prefix. Both paragraphs are replaced by one Q/A node.
 * 2. One paragraph that starts with the question prefix and contains, in one
 *    of its text nodes, a line break followed by the answer prefix. The
 *    paragraph is split at that point and replaced by one Q/A node.
 *
 * In both cases the prefixes are removed from the text. A prefix only counts
 * when it is at the very beginning of its paragraph. Paragraphs that merely
 * look like half of a prompt (for example an "A. " paragraph with no
 * question before it) are left untouched.
 *
 * @param node - Root of the tree to rewrite.
 * @returns The same `node`, after mutation.
 * @throws Error if an answer paragraph cannot be found among the children of
 *   its own parent, which would mean the tree is inconsistent.
 */
function extractQAPromptNodes(node: unist.Node): unist.Node {
  // The wrapped tree is a snapshot taken before any rewriting. It is only used
  // to find candidates; every decision below is re-checked on the live nodes.
  const nodeWithParents = parents(node);

  // Spelling 1: question paragraph followed by answer paragraph.
  const answerNodes = selectAll(
    `paragraph>text[value^='${answerPrefix}']`,
    nodeWithParents
  ) as NodeWithParent[];
  for (const answerNode of answerNodes) {
    const answerParagraphWrapper = answerNode.parent;
    const containerWrapper = answerParagraphWrapper
      ? answerParagraphWrapper.parent
      : null;
    if (!answerParagraphWrapper || !containerWrapper) {
      // The paragraph has no container, so it cannot have a preceding sibling.
      continue;
    }
    const container = containerWrapper.node as unist.Parent;
    const answerParagraphNode = answerParagraphWrapper.node as mdast.Paragraph;

    // The selector matches a prefixed text node anywhere in the paragraph, but
    // only a paragraph that *begins* with the prefix is an answer.
    if (answerParagraphNode.children[0] !== (answerNode.node as unist.Node)) {
      continue;
    }
    const answerTextNode = answerParagraphNode.children[0] as mdast.Text;

    const answerParagraphIndex = container.children.indexOf(
      answerParagraphNode
    );
    if (answerParagraphIndex === -1) {
      throw new Error(
        `Unexpected QA prompt answer node: ${JSON.stringify(
          answerNode,
          null,
          "\t"
        )}`
      );
    }
    if (answerParagraphIndex === 0) {
      // Nothing precedes this paragraph, so there is no question to pair it
      // with. This is ordinary text that happens to start with the prefix.
      continue;
    }

    const questionParagraphNode = container.children[
      answerParagraphIndex - 1
    ] as mdast.Paragraph;
    if (
      questionParagraphNode.type !== "paragraph" ||
      questionParagraphNode.children.length !== 1
    ) {
      continue;
    }
    const questionTextNode = questionParagraphNode.children[0] as mdast.Text;
    if (
      questionTextNode.type !== "text" ||
      !questionTextNode.value.startsWith(questionPrefix)
    ) {
      continue;
    }

    // Now we'll strip the prefixes off.
    questionTextNode.value = questionTextNode.value.slice(
      questionPrefix.length
    );
    answerTextNode.value = answerTextNode.value.slice(answerPrefix.length);

    const qaPromptNode: QAPromptNode = {
      type: qaPromptNodeType,
      question: questionParagraphNode,
      answer: answerParagraphNode
    };
    container.children.splice(answerParagraphIndex - 1, 2, qaPromptNode);
  }

  // Spelling 2: question and answer inside one paragraph.
  const questionNodes = selectAll(
    `paragraph>text[value^='${questionPrefix}']`,
    nodeWithParents
  ) as NodeWithParent[];
  for (const questionNode of questionNodes) {
    const paragraphWrapper = questionNode.parent;
    const containerWrapper = paragraphWrapper ? paragraphWrapper.parent : null;
    if (!paragraphWrapper || !containerWrapper) {
      continue;
    }
    const paragraphNode = paragraphWrapper.node as mdast.Paragraph;
    const paragraphContainer = containerWrapper.node as unist.Parent;

    // The first pass may already have moved this paragraph into a Q/A node;
    // in that case it is no longer a child of its former container.
    const paragraphIndex = paragraphContainer.children.indexOf(paragraphNode);
    if (paragraphIndex === -1) {
      continue;
    }
    // As above, the prefix must open the paragraph.
    if (paragraphNode.children[0] !== (questionNode.node as unist.Node)) {
      continue;
    }

    const splitNodeIndex = paragraphNode.children.findIndex(
      child =>
        child.type === "text" &&
        answerSplitRegexp.test((child as mdast.Text).value)
    );
    if (splitNodeIndex === -1) {
      continue;
    }
    const splitNode = paragraphNode.children[splitNodeIndex] as mdast.Text;
    const match = answerSplitRegexp.exec(splitNode.value);
    if (!match) {
      continue;
    }

    const preSplitString = splitNode.value.slice(0, match.index);
    const questionPhrasingNodes = paragraphNode.children.slice(
      0,
      splitNodeIndex
    );
    const answerPhrasingNodes = paragraphNode.children.slice(splitNodeIndex);
    if (preSplitString !== "") {
      // We've gotta split that node: the part before the line break still
      // belongs to the question.
      questionPhrasingNodes.push({
        type: "text",
        value: preSplitString
      });
    }
    // `splitNode` is the first answer node; keep only what follows the line
    // break and the answer prefix.
    splitNode.value = splitNode.value.slice(match.index + match[0].length);

    // The first question node is either the original leading text node or,
    // when the split happened inside it, the text node pushed just above.
    const firstQuestionNode = questionPhrasingNodes[0] as mdast.Text;
    firstQuestionNode.value = firstQuestionNode.value.slice(
      questionPrefix.length
    );

    const qaPromptNode: QAPromptNode = {
      type: qaPromptNodeType,
      question: { type: "paragraph", children: questionPhrasingNodes },
      answer: { type: "paragraph", children: answerPhrasingNodes }
    };
    paragraphContainer.children.splice(paragraphIndex, 1, qaPromptNode);
  }

  return node;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment