Skip to content

Instantly share code, notes, and snippets.

@nabeel-shakeel
Forked from hamelsmu/html2md.py
Created November 26, 2024 06:08
Show Gist options
  • Save nabeel-shakeel/bc7d32f719aa9cc1198fefdbea15c042 to your computer and use it in GitHub Desktop.
Save nabeel-shakeel/bc7d32f719aa9cc1198fefdbea15c042 to your computer and use it in GitHub Desktop.
HTML to Markdown
from html2text import HTML2Text
from textwrap import dedent
from trafilatura import extract
import re
def get_md(cts, extractor='h2t'):
    """Convert an HTML string to Markdown, fencing code blocks with backticks.

    The HTML is rendered with ``html2text`` (links and images suppressed,
    code marking enabled), then every ``[code] ... [/code]`` region in the
    rendered text is rewritten as a triple-backtick fenced block whose
    contents have their common leading indentation removed.

    Args:
        cts: HTML source text to convert.
        extractor: Accepted for interface compatibility; this function does
            not read it, so the html2text path is always used.

    Returns:
        The Markdown text with leading and trailing whitespace stripped.
    """
    converter = HTML2Text(bodywidth=5000)
    converter.ignore_links = True
    converter.mark_code = True
    converter.ignore_images = True

    # Fall back to an empty string if the converter yields nothing.
    rendered = converter.handle(cts) or ''

    def fence(match):
        # Drop the indentation shared by all lines of the captured code
        # before wrapping it in a fenced block.
        body = dedent(match.group(1))
        return f'```\n{body}\n```'

    # DOTALL lets the lazy capture span multiple lines of code.
    marked_code = re.compile(r'\[code]\s*\n(.*?)\n\[/code]', re.DOTALL)
    return marked_code.sub(fence, rendered).strip()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment