WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(128, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 1280)
      (layers): ModuleList(
        (0-31): 32 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
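The layer shapes in this dump (a 128-channel conv1, d_model of 1280, 32 encoder layers) match Whisper large-v3, so it was presumably produced by simply loading and printing the model, roughly like this (the checkpoint id is an assumption):

from transformers import WhisperForConditionalGeneration

# Assumed checkpoint: the 128-mel conv1 and 32 encoder layers match openai/whisper-large-v3.
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3")
print(model)  # prints the module tree excerpted above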
import evaluate
import polars as pl
import time
import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset

# --- 1. SETUP ---
model_id = '/home/smlkw/en-uk-t/final-checkpoints/kulyk-en-uk'
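The preview stops right after the setup block. A hedged sketch of how such an evaluation loop typically continues, where the dataset id, column names, prompt format, and metric are placeholders rather than the gist's actual choices:

device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(device)
model.eval()

metric = evaluate.load("chrf")                               # metric choice is an assumption
dataset = load_dataset("user/en-uk-eval-set", split="test")  # hypothetical dataset id and split

rows = []
for example in tqdm(dataset):
    prompt = example["source"]  # placeholder column name; the real prompt format is not visible
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    start = time.time()
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=256)
    hypothesis = tokenizer.decode(output[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
    rows.append({"source": prompt, "hypothesis": hypothesis, "latency_s": time.time() - start})

df = pl.DataFrame(rows)  # the polars and time imports suggest per-sample latency is tracked
print(metric.compute(predictions=df["hypothesis"].to_list(),
                     references=[[example["target"]] for example in dataset]))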
import torch
from safetensors.torch import load_file, save_file
import os

# --- Configuration ---
# Specify the path to your input .safetensors file
input_filepath = "model.safetensors"
# Specify the path for the new BF16 output file
output_filepath = "model_bf16.safetensors"
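The conversion body is cut off in the preview; the straightforward version of this cast (floating-point tensors to bfloat16, everything else left untouched) looks roughly like:

state_dict = load_file(input_filepath)

converted = {}
for name, tensor in state_dict.items():
    # Only cast floating-point weights; keep integer buffers (e.g. token ids) as they are.
    converted[name] = tensor.to(torch.bfloat16) if tensor.is_floating_point() else tensor

save_file(converted, output_filepath)
print(f"Wrote {os.path.getsize(output_filepath) / 1e6:.1f} MB to {output_filepath}")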
import pandas as pd
from datasets import load_dataset
from rensa import CMinHash, RMinHash
from tqdm import tqdm

COLUMN = "source"
SPLIT = "train"
ALGORITHM = "CMinHash"
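The rest of the deduplication script is not shown; below is a minimal sketch of MinHash-signature dedup with rensa, assuming the usual update/digest interface for both algorithms and a placeholder dataset id:

dataset = load_dataset("user/parallel-corpus", split=SPLIT)  # hypothetical dataset id

def signature(text, num_perm=128):
    cls = CMinHash if ALGORITHM == "CMinHash" else RMinHash
    m = cls(num_perm=num_perm, seed=42)
    m.update(text.split())
    return tuple(m.digest())

seen, keep = set(), []
for row in tqdm(dataset):
    sig = signature(row[COLUMN])
    if sig not in seen:  # identical signatures are treated as duplicates
        seen.add(sig)
        keep.append(row)

print(f"kept {len(keep)} of {len(dataset)} rows")
pd.DataFrame(keep).to_parquet("deduplicated.parquet")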
import json
from glob import glob
from os.path import basename

files_all = glob("data/*.mp3")
results = []
for idx, filename in enumerate(files_all):
    duration = 0
    results.append({'file_name': basename(filename), 'duration': duration, 'transcription': '-'})
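The loop above leaves duration at 0 and never uses the json import, so presumably the full script fills in real durations and writes the rows out as a metadata file. A hedged sketch of that missing part (mutagen for MP3 duration and the metadata.jsonl path are assumptions):

from mutagen.mp3 import MP3  # assumption: any MP3 metadata reader would do

for entry in results:
    entry['duration'] = MP3(f"data/{entry['file_name']}").info.length  # duration in seconds

# One JSON object per line, in the style of a HuggingFace audiofolder metadata file.
with open("data/metadata.jsonl", "w", encoding="utf-8") as f:
    for entry in results:
        f.write(json.dumps(entry, ensure_ascii=False) + "\n")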
fn q_rsqrt(number: f32) -> f32 {
    let threehalfs: f32 = 1.5;
    let x2: f32 = number * 0.5;
    let mut y: f32 = number;

    let i: u32 = y.to_bits();           // safely get the bit representation of the float
    let i: u32 = 0x5f3759df - (i >> 1); // what the heck?
    y = f32::from_bits(i);              // safely convert bits back to float

    y * (threehalfs - (x2 * y * y))     // 1st iteration
}
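As a quick, purely illustrative cross-check of the magic-constant trick, the same bit manipulation can be reproduced in Python with struct and compared against 1/sqrt(x):

import math
import struct

def q_rsqrt(number):
    # Reinterpret the float32 bits as a u32, apply the magic constant,
    # then one Newton-Raphson step, mirroring the Rust function above.
    i = struct.unpack("<I", struct.pack("<f", number))[0]
    i = 0x5f3759df - (i >> 1)
    y = struct.unpack("<f", struct.pack("<I", i))[0]
    return y * (1.5 - 0.5 * number * y * y)

print(q_rsqrt(4.0), 1 / math.sqrt(4.0))  # ~0.499 vs 0.5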
import torchaudio
from speechbrain.pretrained import VAD

# CRDNN voice-activity-detection model trained on LibriParty; it expects 16 kHz mono audio.
vad = VAD.from_hparams(source="speechbrain/vad-crdnn-libriparty", savedir="pretrained_models/vad-crdnn-libriparty")

test_file = 'a.wav'
boundaries = vad.get_speech_segments(test_file)     # [start, end] times (seconds) of each detected speech region
segments = vad.get_segments(boundaries, test_file)  # the corresponding audio chunks cropped from the file
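torchaudio is imported but unused in the visible part of the snippet; presumably the cropped chunks are written back to disk along these lines (16 kHz is assumed from the model card, and the output naming is made up):

for i, chunk in enumerate(segments):
    # each chunk is a [channels, time] waveform tensor covering one detected speech region
    torchaudio.save(f"segment_{i:03d}.wav", chunk, 16000)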
""" | |
Python implementation of Viterbi algorithm for word segmentation | |
A clean-up of this: http://norvig.com/ngrams/ch14.pdf | |
- | |
You also need 'unigrams.txt' and 'bigrams.txt' to run the segmentation. The ngrams | |
used in this implementation is from the 'count_1w.txt' and 'count_2w.txt' provided | |
here: http://norvig.com/ngrams/ | |
- | |
Usage: | |
>>> from segment import viterbi |
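The usage block is cut off after the import line and the gist's actual function signature is not visible, so the following is only a self-contained sketch of the unigram-Viterbi idea the docstring describes, with toy counts standing in for unigrams.txt and a hypothetical function name:

from math import log10

# Toy unigram counts standing in for 'unigrams.txt' / Norvig's count_1w.txt.
COUNTS = {"this": 500, "is": 900, "a": 1000, "test": 300, "to": 800, "day": 200, "today": 150}
TOTAL = sum(COUNTS.values())

def word_prob(word):
    # Unseen words get a penalty that grows with their length, as in Norvig's ch14.
    if word in COUNTS:
        return log10(COUNTS[word] / TOTAL)
    return log10(1.0 / (TOTAL * 10 ** len(word)))

def viterbi_segment(text):
    # best[i] = (log-probability, segmentation) of the best split of text[:i]
    best = [(0.0, [])] + [(-float("inf"), [])] * len(text)
    for i in range(1, len(text) + 1):
        for j in range(max(0, i - 20), i):  # cap candidate word length at 20 characters
            prob = best[j][0] + word_prob(text[j:i])
            if prob > best[i][0]:
                best[i] = (prob, best[j][1] + [text[j:i]])
    return best[len(text)]

print(viterbi_segment("thisisatest"))  # -> (log-prob, ['this', 'is', 'a', 'test'])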
- GPU: Tesla V100
- Ubuntu 18.04
apt update
apt install cmake gcc-7 liblzma-dev libbz2-dev
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-repo-ubuntu1804_10.0.130-1_amd64.deb
dpkg -i cuda-repo-ubuntu1804_10.0.130-1_amd64.deb
""" | |
Generator of words forms for LinguisticAndInformationSystems/mphdict | |
Source code: https://github.com/LinguisticAndInformationSystems/mphdict/blob/master/src/mphdict/mphDb.cs#L214 | |
License: https://github.com/LinguisticAndInformationSystems/mphdict/blob/master/LICENSE.txt | |
Copyright: uSofTrod | |
Output is like the following: |