Skip to content

Instantly share code, notes, and snippets.

View luistung's full-sized avatar
🎯
Focusing

Luis Tung luistung

🎯
Focusing
  • Hangzhou, China
View GitHub Profile
@luistung
luistung / decorator.rkt
Last active March 16, 2025 16:14
Python-style decorators in Racket
#lang racket
(define-syntax (define-decorator stx)
(syntax-case stx ()
[(_ (dector EXEC_RESULT) body ...)
#`(define (dector fun)
(make-keyword-procedure
(lambda (kws kw-args . args)
(let-syntax ([EXEC_RESULT
(lambda (stx)
@luistung
luistung / interact_with_python.py
Created February 22, 2025 15:15
How to control other interactive applications (using pexpect)
import pexpect
import re
def interact_with_repl():
child = pexpect.spawn('python3')
child.expect(['>>> '])
expect_ret = '>>> '
while True:
@luistung
luistung / streamed_thread_pool_executor.py
Created November 16, 2024 09:04
this code implements a parallel task streaming executor using thread pools
from typing import TypeVar, Callable, Iterable, Tuple, Iterator
import concurrent.futures
import time
import random
T = TypeVar('T')
R = TypeVar('R')
def stream_parallel_tasks(
task_fun: Callable[[T], R],
@luistung
luistung / fmt-quickscript.rkt
Last active November 29, 2024 16:40 — forked from Metaxal/fmt-quickscript.rkt
Quickscript for sorawee's `fmt`
#lang racket/base
(require quickscript
fmt ; needs to be installed first
racket/class
racket/set
racket/list)
;;; Author: Laurent Orseau
;;; License: [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0) or
;;; [MIT license](http://opensource.org/licenses/MIT) at your option.
@luistung
luistung / llm_ft.py
Last active April 28, 2024 09:53
LLM fine-tuning example
from transformers import AutoTokenizer
from datasets import Dataset
import torch
import pandas as pd
import numpy as np
# Load the pretrained "gpt2" tokenizer.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token # Set the padding token (reuses the end-of-sequence token)
qa_pairs = [
@luistung
luistung / continue_pretrain.py
Created April 28, 2024 09:45
Continued pretraining example using Hugging Face
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import Dataset
# Choose the model; this can be replaced with any model supported by transformers, e.g. "bert-base-uncased", "gpt2", etc.
# NOTE(review): the model is loaded through AutoModelForCausalLM below, so a replacement
# presumably needs to be usable as a causal LM — confirm before swapping.
model_name = "gpt2"
device = torch.device("cpu")
# Load the model and tokenizer (only the model load is visible in this excerpt),
# then move the model to the selected device.
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

Keybase proof

I hereby claim:

  • I am luistung on github.
  • I am luistung (https://keybase.io/luistung) on keybase.
  • I have a public key ASCcsP70NQH1pe_YHkb_VWNteyojKMnNa4gSEpNRxAF3_Qo

To claim this, I am signing this object:

@luistung
luistung / CMakeLists.txt
Created June 11, 2021 02:58
pytorch to c++
# Build script for a C++ executable that links against Torch.
# Fail with a fatal error if the running CMake is older than 3.0.
cmake_minimum_required(VERSION 3.0 FATAL_ERROR)
project(custom_ops)
# Locate the Torch CMake package; REQUIRED aborts configuration if it is not found.
find_package(Torch REQUIRED)
# Build the example-app executable from example-app.cpp.
add_executable(example-app example-app.cpp)
# Link example-app against the libraries listed in TORCH_LIBRARIES
# (presumably set by the Torch package found above).
target_link_libraries(example-app "${TORCH_LIBRARIES}")
# Compile example-app with the C++14 standard.
set_property(TARGET example-app PROPERTY CXX_STANDARD 14)
@luistung
luistung / tokenization.cpp
Last active May 30, 2024 03:15
c++ version of bert tokenize
/* c++ version of tokenization for bert
Copyright (C) 2019 luistung
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
@luistung
luistung / tokenization.cpp
Created October 11, 2019 12:02
c++ version of bert tokenize
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <unordered_map>
#include <boost/algorithm/string.hpp>
#include <utf8proc.h>
//https://unicode.org/reports/tr15/#Norm_Forms
//https://ssl.icu-project.org/apiref/icu4c/uchar_8h.html