pip install markdown imgkit
You'll also need to install wkhtmltopdf on your system:
- On Ubuntu/Debian: sudo apt-get install wkhtmltopdf
- On macOS: brew install wkhtmltopdf
- On Windows: download the installer from the wkhtmltopdf website.
pip install markdown imgkit
You'll also need to install wkhtmltopdf on your system:
- On Ubuntu/Debian: sudo apt-get install wkhtmltopdf
- On macOS: brew install wkhtmltopdf
- On Windows: download the installer from the wkhtmltopdf website.
#!/usr/bin/env python3 | |
"""Convert TMX files to ParaConc format. | |
This script converts TMX (Translation Memory eXchange) files to ParaConc format, | |
which consists of three separate XML files: source language, target language, | |
and alignment information. It supports complex alignment patterns, HTML tag preservation, | |
and includes input validation. | |
Example usage: | |
python tmx_to_paraconc.py input.tmx -o output_prefix |
import os | |
import torch | |
import numpy as np | |
from sentence_transformers import SentenceTransformer | |
# Load pre-trained model for sentence embeddings | |
model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2") | |
# Set up LSTM model | |
input_size = 768 # Size of the sentence embeddings |
import json | |
import argparse | |
from typing import Dict | |
from pathlib import Path | |
import smart_open | |
import ftfy | |
from tqdm import tqdm | |
import html2text | |
from datasets import load_dataset |
import argparse | |
from flair.data import Sentence | |
from flair.embeddings import ( | |
DocumentEmbeddings, | |
FlairEmbeddings, | |
DocumentLMEmbeddings, | |
DocumentPoolEmbeddings, | |
) | |
from torch import Tensor |
#!/bin/bash | |
# Requires GNU parallel and pv to run: `sudo apt-get install parallel pv` | |
# download file containing urls | |
curl http://webdatacommons.org/structureddata/2022-12/files/file.list > urls.txt | |
# create output file | |
touch output.txt |
import bz2 | |
import logging | |
import multiprocessing | |
import re | |
from pickle import PicklingError | |
# LXML isn't faster, so let's go with the built-in solution | |
from xml.etree.ElementTree import iterparse |
# pip install pymorphy3 | |
# pip install pymorphy3-dicts-uk | |
import pymorphy3 | |
from collections import defaultdict | |
from itertools import product | |
from typing import List, List | |
morph = pymorphy3.MorphAnalyzer(lang="uk") |
import os.path | |
from flair.data import Dictionary | |
from flair.models import LanguageModel | |
from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus | |
def train_flair_embeddings( | |
corpus_path="/data/ubertext/for_flair", | |
dictionary_path="/home/dima/Projects/flair_embeddings/flair_dictionary.pkl", | |
lm_file="./language_model_forward_no_amp_accum_grad_fixed", |
import wn | |
import csv | |
from collections import Counter, defaultdict | |
from tqdm.notebook import tqdm | |
wn.download("pwn:3.1") | |
pwn = wn.Wordnet("pwn:3.1") |