Skip to content

Instantly share code, notes, and snippets.

@rjurney
Last active January 31, 2025 07:58
Show Gist options
  • Save rjurney/1100f6ed9dc86b420dcff7e9ddcdfcb6 to your computer and use it in GitHub Desktop.
Save rjurney/1100f6ed9dc86b420dcff7e9ddcdfcb6 to your computer and use it in GitHub Desktop.
Relik for relation extraction on the GraphFrames paper
"""Script that tests and times Relik's relation extraction and entity linking on the GraphFrames Paper: https://people.eecs.berkeley.edu/~matei/papers/2016/grades_graphframes.pdf"""
import timeit
import warnings
from pprint import pprint
from relik import Relik # type: ignore
from relik.inference.data.objects import RelikOutput # type: ignore
# Squash Relik's warnings for prettier screenshots
warnings.simplefilter("ignore")

# Load the GraphFrames paper text. The encoding is pinned so the read does not
# depend on the platform's default locale encoding.
# NOTE(review): assumes the text file is UTF-8 — confirm against how it was
# extracted from the PDF.
with open("data/grades_graphframes.txt", encoding="utf-8") as f:
    paper: str = f.read()

# Load the pretrained Relik relation-extraction model
relik = Relik.from_pretrained("sapienzanlp/relik-relation-extraction-nyt-large")

# Run the model over the full text of the paper
relik_out: RelikOutput = relik(paper)

# Have a looksee! As a bare expression this only displays a value in a
# REPL/notebook; when run as a plain script it produces no output.
relik_out.triplets
"""Script that tests Relik's relation extraction and entity linking on the GraphFrames Paper: https://people.eecs.berkeley.edu/~matei/papers/2016/grades_graphframes.pdf"""
# Build the pretrained relation-extraction pipeline once at module level so
# the timed function below reuses it instead of reloading it.
# (Original author's note: the model comes from Huggingface and is loaded
# using Facebook Hydra.)
relik = Relik.from_pretrained(
    "sapienzanlp/relik-relation-extraction-nyt-large"
)
def do_relik() -> RelikOutput:
    """Run Relik on the GraphFrames paper and pretty print the triplets it extracts.

    Reads the module-level ``relik`` model and ``paper`` text; takes no
    arguments so it can be handed directly to ``timeit``.

    Returns:
        The ``RelikOutput`` produced by calling ``relik`` on ``paper``.
    """
    relik_out: RelikOutput = relik(paper)

    # Check out the triplets
    pprint(relik_out.triplets, indent=4, depth=2, sort_dicts=True)

    # Return the result so the declared return type is honored; previously
    # the function fell off the end and returned None.
    return relik_out
# Measure one end-to-end call of do_relik() (inference plus pretty-printing).
# number=1 because a single pass already takes on the order of a minute.
elapsed_time = timeit.timeit(do_relik, number=1)

# Report the wall-clock time alongside the size of the input text
print(
    f"Relik relation extraction time for {len(paper):,} characters: {elapsed_time:.6f} seconds"
)
___ __
/\_ \ __ /\ \
_ __ __ \//\ \ /\_\ \ \ \/'\
/\`'__\ /'__`\ \ \ \ \/\ \ \ \ , <
\ \ \/ /\ __/ \_\ \_ \ \ \ \ \ \\`\
\ \_\ \ \____\ /\____\ \ \_\ \ \_\ \_\
\/_/ \/____/ \/____/ \/_/ \/_/\/_/
[2025-01-30 23:56:30,111] [INFO] [relik.inference.annotator.from_pretrained:700] [PID:3289432] [RANK:0] Loading Relik from sapienzanlp/relik-relation-extraction-nyt-large
[2025-01-30 23:56:30,112] [INFO] [relik.inference.annotator.from_pretrained:701] [PID:3289432] [RANK:0] {
'_target_': 'relik.inference.annotator.Relik',
'index': {
'triplet': {
'_target_': 'relik.retriever.indexers.inmemory.InMemoryDocumentIndex.from_pretrained',
'name_or_path': 'sapienzanlp/relik-retriever-small-nyt-document-index',
},
},
'metadata_fields': [],
'reader': {
'_target_': 'relik.reader.pytorch_modules.triplet.RelikReaderForTripletExtraction',
'transformer_model': 'sapienzanlp/relik-reader-deberta-v3-large-nyt',
},
'retriever': {
'triplet': {
'_target_': 'relik.retriever.pytorch_modules.model.GoldenRetriever',
'question_encoder': 'sapienzanlp/relik-retriever-small-nyt-question-encoder',
},
},
'task': 'TRIPLET',
'top_k': 24,
'window_size': 'sentence',
'window_stride': None,
}
[2025-01-30 23:56:30,884] [INFO] [relik.retriever.indexers.base.from_pretrained:484] [PID:3289432] [RANK:0] Loading Index from config:
[2025-01-30 23:56:30,884] [INFO] [relik.retriever.indexers.base.from_pretrained:485] [PID:3289432] [RANK:0] {
'_target_': 'relik.retriever.indexers.inmemory.InMemoryDocumentIndex',
'device': 'cpu',
'metadata_fields': ['definition'],
'name_or_path': None,
'precision': None,
'separator': ' <def> ',
'use_faiss': False,
}
[2025-01-30 23:56:30,884] [INFO] [relik.retriever.indexers.base.from_pretrained:492] [PID:3289432] [RANK:0] Loading documents from /home/rjurney/.cache/huggingface/hub/models--sapienzanlp--relik-retriever-small-nyt-document-index/snapshots/7507014d674a84d622649acb1a8e0a7883bfec3f/documents.jsonl
[2025-01-30 23:56:30,885] [INFO] [relik.retriever.indexers.base.from_pretrained:535] [PID:3289432] [RANK:0] Loading embeddings from /home/rjurney/.cache/huggingface/hub/models--sapienzanlp--relik-retriever-small-nyt-document-index/snapshots/7507014d674a84d622649acb1a8e0a7883bfec3f/embeddings.pt
[2025-01-30 23:56:30,886] [INFO] [relik.retriever.indexers.inmemory.__init__:65] [PID:3289432] [RANK:0] Both documents and embeddings are provided.
[2025-01-30 23:57:35,999] [DEBUG] [relik.reader.data.relik_reader_re_data.__iter__:399] [PID:3289432] [RANK:0] Dataset finished: 328 number of elements processed
[ Triplets(subject=Span(start=677, end=683, label='--NME--', text='GraphX'), label='company', object=Span(start=654, end=659, label='--NME--', text='RDBMS'), confidence=0.9800000190734863),
Triplets(subject=Span(start=5167, end=5172, label='--NME--', text='Spark'), label='company location', object=Span(start=5156, end=5162, label='--NME--', text='Python'), confidence=1.0),
Triplets(subject=Span(start=7841, end=7846, label='--NME--', text='Spark'), label='company location', object=Span(start=7781, end=7787, label='--NME--', text='Python'), confidence=1.0),
Triplets(subject=Span(start=8493, end=8498, label='--NME--', text='Spark'), label='company location', object=Span(start=8481, end=8487, label='--NME--', text='Python'), confidence=1.0),
Triplets(subject=Span(start=12187, end=12203, label='--NME--', text='Spark DataFrames'), label='company location', object=Span(start=12227, end=12233, label='--NME--', text='Python'), confidence=0.9900000095367432),
Triplets(subject=Span(start=12687, end=12691, label='--NME--', text='Solr'), label='company location', object=Span(start=12905, end=12911, label='--NME--', text='Amazon'), confidence=0.9700000286102295),
Triplets(subject=Span(start=25259, end=25264, label='--NME--', text='Scala'), label='contains', object=Span(start=25152, end=25162, label='--NME--', text='GraphFrame'), confidence=1.0),
Triplets(subject=Span(start=32492, end=32498, label='--NME--', text='Pregel'), label='company location', object=Span(start=32503, end=32511, label='--NME--', text='GraphLab'), confidence=1.0),
Triplets(subject=Span(start=36661, end=36693, label='--NME--', text='SIGMOD (2010).\n[13] McA uley, J.'), label='company', object=Span(start=36661, end=36667, label='--NME--', text='SIGMOD'), confidence=0.9900000095367432),
Triplets(subject=Span(start=36691, end=36693, label='--NME--', text='J.'), label='company', object=Span(start=36661, end=36667, label='--NME--', text='SIGMOD'), confidence=1.0),
Triplets(subject=Span(start=36691, end=36693, label='--NME--', text='J.'), label='company', object=Span(start=36661, end=36689, label='--NME--', text='SIGMOD (2010).\n[13] McA uley'), confidence=1.0),
Triplets(subject=Span(start=36691, end=36693, label='--NME--', text='J.'), label='company', object=Span(start=36661, end=36693, label='--NME--', text='SIGMOD (2010).\n[13] McA uley, J.'), confidence=0.949999988079071),
Triplets(subject=Span(start=36943, end=36948, label='--NME--', text='M. A.'), label='company', object=Span(start=36953, end=36960, label='--NME--', text='Gremlin'), confidence=1.0),
Triplets(subject=Span(start=38539, end=38544, label='--NME--', text='Huang'), label='company', object=Span(start=38469, end=38479, label='--NME--', text='GraphFrame'), confidence=1.0)]
Relik relation extraction time for 48,155 characters: 62.961591 seconds
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment