Created
April 9, 2021 21:19
-
-
Save islem-esi/6e7b679e0eb443bf35c96425fd1aeaaf to your computer and use it in GitHub Desktop.
Examples of text-summarization models: extractive (sumy's LSA, Luhn, LexRank, KL) and abstractive (Hugging Face T5 and BART).
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import gc

import sumy
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
#LSA algorithm | |
from sumy.summarizers.lsa import LsaSummarizer | |
#text: text to summarize | |
#no_sentences: number of sentences in your summary, | |
#lang: language of text | |
def lsa_summary(text, no_sentences, lang):
    """Summarize *text* with the LSA (latent semantic analysis) algorithm.

    Args:
        text: plain-text document to summarize.
        no_sentences: number of sentences to keep in the summary.
        lang: language of *text*, passed to sumy's Tokenizer (e.g. "english").

    Returns:
        list[str]: the selected summary sentences, in document order.
    """
    parser = PlaintextParser.from_string(text, Tokenizer(lang))
    summarizer = LsaSummarizer()
    summary = summarizer(parser.document, no_sentences)
    # Eagerly release parser/model scratch objects between calls.
    # NOTE: gc was never imported in the original file; fixed at the top-level
    # import block.
    gc.collect()
    return [str(sentence) for sentence in summary]
#Luhn | |
from sumy.summarizers.luhn import LuhnSummarizer | |
#text: text to summarize | |
#no_sentences: number of sentences in your summary, | |
#lang: language of text | |
def luhn_summary(text, no_sentences, lang):
    """Summarize *text* with the Luhn frequency-based algorithm.

    Args:
        text: plain-text document to summarize.
        no_sentences: number of sentences to keep in the summary.
        lang: language of *text*, passed to sumy's Tokenizer (e.g. "english").

    Returns:
        list[str]: the selected summary sentences, in document order.
    """
    # Use the documented from_string() classmethod (the original called the
    # raw constructor, inconsistent with every sibling function here).
    parser = PlaintextParser.from_string(text, Tokenizer(lang))
    summarizer = LuhnSummarizer()
    summary = summarizer(parser.document, no_sentences)
    # Eagerly release parser/model scratch objects between calls.
    gc.collect()
    return [str(sentence) for sentence in summary]
#LexRank | |
from sumy.summarizers.lex_rank import LexRankSummarizer | |
#text: text to summarize | |
#no_sentences: number of sentences in your summary, | |
#lang: language of text | |
def lex_summary(text, no_sentences, lang):
    """Summarize *text* with the LexRank graph-based algorithm.

    Args:
        text: plain-text document to summarize.
        no_sentences: number of sentences to keep in the summary.
        lang: language of *text*, passed to sumy's Tokenizer (e.g. "english").

    Returns:
        list[str]: the selected summary sentences, in document order.
    """
    parser = PlaintextParser.from_string(text, Tokenizer(lang))
    summarizer = LexRankSummarizer()
    summary = summarizer(parser.document, no_sentences)
    # Eagerly release parser/model scratch objects between calls.
    gc.collect()
    return [str(sentence) for sentence in summary]
#KL | |
from sumy.summarizers.kl import KLSummarizer | |
#text: text to summarize | |
#no_sentences: number of sentences in your summary, | |
#lang: language of text | |
def kl_summary(text, no_sentences, lang):
    """Summarize *text* with the KL-divergence algorithm.

    Args:
        text: plain-text document to summarize.
        no_sentences: number of sentences to keep in the summary.
        lang: language of *text*, passed to sumy's Tokenizer (e.g. "english").

    Returns:
        list[str]: the selected summary sentences, in document order.
    """
    parser = PlaintextParser.from_string(text, Tokenizer(lang))
    summarizer = KLSummarizer()
    summary = summarizer(parser.document, sentences_count=no_sentences)
    # Eagerly release parser/model scratch objects between calls.
    gc.collect()
    return [str(sentence) for sentence in summary]
#Transformers T5 | |
from transformers import T5Tokenizer, T5Config, T5ForConditionalGeneration | |
#text: text to summarize | |
#model: t5-base, t5-small, t5-large, t5-3b, t5-11b | |
def t5_summary(text, model):
    """Summarize *text* with a pretrained T5 model (abstractive).

    Args:
        text: text to summarize.
        model: T5 checkpoint name: t5-small, t5-base, t5-large, t5-3b, t5-11b.

    Returns:
        str: the generated summary.
    """
    my_model = T5ForConditionalGeneration.from_pretrained(model)
    tokenizer = T5Tokenizer.from_pretrained(model)
    # T5's summarization task prefix is "summarize: " WITH the trailing space;
    # the original "summarize:"+text fused the prefix into the first word.
    input_ids = tokenizer.encode("summarize: " + text, return_tensors='pt',
                                 max_length=512, truncation=True)
    summary_ids = my_model.generate(input_ids)
    # skip_special_tokens keeps <pad>/</s> markers out of the returned text
    # (the sibling BART function already did this).
    t5_sum = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    gc.collect()
    return str(t5_sum)
#BART | |
from transformers import BartForConditionalGeneration, BartTokenizer, BartConfig | |
#text: text to summarize | |
#model: bart-base, bart-large, bart-large-cnn | |
def bart_summary(text, model):
    """Summarize *text* with a pretrained facebook/BART model (abstractive).

    Args:
        text: text to summarize.
        model: BART checkpoint suffix: bart-base, bart-large, bart-large-cnn.

    Returns:
        str: the generated summary.
    """
    checkpoint = 'facebook/' + str(model)
    tokenizer = BartTokenizer.from_pretrained(checkpoint)
    # Renamed from `model`, which shadowed the checkpoint-name parameter.
    bart_model = BartForConditionalGeneration.from_pretrained(checkpoint)
    # batch_encode_plus expects a *list* of texts; the original passed a bare
    # string, which the tokenizer iterates per character — wrap in a list.
    inputs = tokenizer.batch_encode_plus([text], return_tensors='pt',
                                         padding=True, truncation=True)
    summary_ids = bart_model.generate(inputs['input_ids'], early_stopping=True)
    summary_text = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    gc.collect()
    return str(summary_text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment