Last active
January 31, 2025 07:58
-
-
Save rjurney/1100f6ed9dc86b420dcff7e9ddcdfcb6 to your computer and use it in GitHub Desktop.
Relik for relation extraction on the GraphFrames paper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Script that tests and times Relik's relation extraction and entity linking on the GraphFrames Paper: https://people.eecs.berkeley.edu/~matei/papers/2016/grades_graphframes.pdf""" | |
import timeit | |
import warnings | |
from pprint import pprint | |
from relik import Relik # type: ignore | |
from relik.inference.data.objects import RelikOutput # type: ignore | |
# Squash Relik's warnings for prettier screenshots
warnings.simplefilter("ignore")

# Load the GraphFrames paper text. The encoding is pinned so the read does not
# depend on the platform's locale default.
# NOTE(review): assumes the text file is stored as UTF-8 - confirm.
with open("data/grades_graphframes.txt", encoding="utf-8") as f:
    paper: str = f.read()

# Load the pretrained relation extraction model and run it over the whole paper
relik = Relik.from_pretrained("sapienzanlp/relik-relation-extraction-nyt-large")
relik_out: RelikOutput = relik(paper)

# Have a looksee! A bare expression is only echoed in a REPL or notebook;
# when this file runs as a script the line below evaluates and discards the value.
relik_out.triplets
"""Script that tests Relik's relation extraction and entity linking on the GraphFrames Paper: https://people.eecs.berkeley.edu/~matei/papers/2016/grades_graphframes.pdf""" | |
# Load the model from Huggingface using Facebook Hydra
# NOTE(review): if an identical from_pretrained() call already ran earlier in
# this module, this one repeats the load and could be dropped - confirm.
relik = Relik.from_pretrained("sapienzanlp/relik-relation-extraction-nyt-large")
def do_relik() -> RelikOutput:
    """Run Relik on the GraphFrames paper and pretty print the triplets it extracts.

    Reads the module-level ``relik`` model and ``paper`` text.

    Returns:
        The ``RelikOutput`` produced by calling the model on the paper. The
        function is annotated as returning ``RelikOutput``, so the result is
        returned explicitly instead of falling off the end with ``None``.
    """
    relik_out: RelikOutput = relik(paper)

    # Check out the triplets
    pprint(relik_out.triplets, indent=4, depth=2, sort_dicts=True)

    return relik_out
# Time the inference on the paper. timeit accepts a callable directly, so there
# is no need to pass a source string plus globals(); number=1 runs it exactly once.
elapsed_time = timeit.timeit(do_relik, number=1)
print(f"Relik relation extraction time for {len(paper):,} characters: {elapsed_time:.6f} seconds")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
___ __ | |
/\_ \ __ /\ \ | |
_ __ __ \//\ \ /\_\ \ \ \/'\ | |
/\`'__\ /'__`\ \ \ \ \/\ \ \ \ , < | |
\ \ \/ /\ __/ \_\ \_ \ \ \ \ \ \\`\ | |
\ \_\ \ \____\ /\____\ \ \_\ \ \_\ \_\ | |
\/_/ \/____/ \/____/ \/_/ \/_/\/_/ | |
[2025-01-30 23:56:30,111] [INFO] [relik.inference.annotator.from_pretrained:700] [PID:3289432] [RANK:0] Loading Relik from sapienzanlp/relik-relation-extraction-nyt-large | |
[2025-01-30 23:56:30,112] [INFO] [relik.inference.annotator.from_pretrained:701] [PID:3289432] [RANK:0] { | |
'_target_': 'relik.inference.annotator.Relik', | |
'index': { | |
'triplet': { | |
'_target_': 'relik.retriever.indexers.inmemory.InMemoryDocumentIndex.from_pretrained', | |
'name_or_path': 'sapienzanlp/relik-retriever-small-nyt-document-index', | |
}, | |
}, | |
'metadata_fields': [], | |
'reader': { | |
'_target_': 'relik.reader.pytorch_modules.triplet.RelikReaderForTripletExtraction', | |
'transformer_model': 'sapienzanlp/relik-reader-deberta-v3-large-nyt', | |
}, | |
'retriever': { | |
'triplet': { | |
'_target_': 'relik.retriever.pytorch_modules.model.GoldenRetriever', | |
'question_encoder': 'sapienzanlp/relik-retriever-small-nyt-question-encoder', | |
}, | |
}, | |
'task': 'TRIPLET', | |
'top_k': 24, | |
'window_size': 'sentence', | |
'window_stride': None, | |
} | |
[2025-01-30 23:56:30,884] [INFO] [relik.retriever.indexers.base.from_pretrained:484] [PID:3289432] [RANK:0] Loading Index from config: | |
[2025-01-30 23:56:30,884] [INFO] [relik.retriever.indexers.base.from_pretrained:485] [PID:3289432] [RANK:0] { | |
'_target_': 'relik.retriever.indexers.inmemory.InMemoryDocumentIndex', | |
'device': 'cpu', | |
'metadata_fields': ['definition'], | |
'name_or_path': None, | |
'precision': None, | |
'separator': ' <def> ', | |
'use_faiss': False, | |
} | |
[2025-01-30 23:56:30,884] [INFO] [relik.retriever.indexers.base.from_pretrained:492] [PID:3289432] [RANK:0] Loading documents from /home/rjurney/.cache/huggingface/hub/models--sapienzanlp--relik-retriever-small-nyt-document-index/snapshots/7507014d674a84d622649acb1a8e0a7883bfec3f/documents.jsonl | |
[2025-01-30 23:56:30,885] [INFO] [relik.retriever.indexers.base.from_pretrained:535] [PID:3289432] [RANK:0] Loading embeddings from /home/rjurney/.cache/huggingface/hub/models--sapienzanlp--relik-retriever-small-nyt-document-index/snapshots/7507014d674a84d622649acb1a8e0a7883bfec3f/embeddings.pt | |
[2025-01-30 23:56:30,886] [INFO] [relik.retriever.indexers.inmemory.__init__:65] [PID:3289432] [RANK:0] Both documents and embeddings are provided. | |
[2025-01-30 23:57:35,999] [DEBUG] [relik.reader.data.relik_reader_re_data.__iter__:399] [PID:3289432] [RANK:0] Dataset finished: 328 number of elements processed | |
[ Triplets(subject=Span(start=677, end=683, label='--NME--', text='GraphX'), label='company', object=Span(start=654, end=659, label='--NME--', text='RDBMS'), confidence=0.9800000190734863), | |
Triplets(subject=Span(start=5167, end=5172, label='--NME--', text='Spark'), label='company location', object=Span(start=5156, end=5162, label='--NME--', text='Python'), confidence=1.0), | |
Triplets(subject=Span(start=7841, end=7846, label='--NME--', text='Spark'), label='company location', object=Span(start=7781, end=7787, label='--NME--', text='Python'), confidence=1.0), | |
Triplets(subject=Span(start=8493, end=8498, label='--NME--', text='Spark'), label='company location', object=Span(start=8481, end=8487, label='--NME--', text='Python'), confidence=1.0), | |
Triplets(subject=Span(start=12187, end=12203, label='--NME--', text='Spark DataFrames'), label='company location', object=Span(start=12227, end=12233, label='--NME--', text='Python'), confidence=0.9900000095367432), | |
Triplets(subject=Span(start=12687, end=12691, label='--NME--', text='Solr'), label='company location', object=Span(start=12905, end=12911, label='--NME--', text='Amazon'), confidence=0.9700000286102295), | |
Triplets(subject=Span(start=25259, end=25264, label='--NME--', text='Scala'), label='contains', object=Span(start=25152, end=25162, label='--NME--', text='GraphFrame'), confidence=1.0), | |
Triplets(subject=Span(start=32492, end=32498, label='--NME--', text='Pregel'), label='company location', object=Span(start=32503, end=32511, label='--NME--', text='GraphLab'), confidence=1.0), | |
Triplets(subject=Span(start=36661, end=36693, label='--NME--', text='SIGMOD (2010).\n[13] McA uley, J.'), label='company', object=Span(start=36661, end=36667, label='--NME--', text='SIGMOD'), confidence=0.9900000095367432), | |
Triplets(subject=Span(start=36691, end=36693, label='--NME--', text='J.'), label='company', object=Span(start=36661, end=36667, label='--NME--', text='SIGMOD'), confidence=1.0), | |
Triplets(subject=Span(start=36691, end=36693, label='--NME--', text='J.'), label='company', object=Span(start=36661, end=36689, label='--NME--', text='SIGMOD (2010).\n[13] McA uley'), confidence=1.0), | |
Triplets(subject=Span(start=36691, end=36693, label='--NME--', text='J.'), label='company', object=Span(start=36661, end=36693, label='--NME--', text='SIGMOD (2010).\n[13] McA uley, J.'), confidence=0.949999988079071), | |
Triplets(subject=Span(start=36943, end=36948, label='--NME--', text='M. A.'), label='company', object=Span(start=36953, end=36960, label='--NME--', text='Gremlin'), confidence=1.0), | |
Triplets(subject=Span(start=38539, end=38544, label='--NME--', text='Huang'), label='company', object=Span(start=38469, end=38479, label='--NME--', text='GraphFrame'), confidence=1.0)] | |
Relik relation extraction time for 48,155 characters: 62.961591 seconds |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment