Last active
October 11, 2021 21:08
-
-
Save chrismwendt/0109758559e537c8cbd9bb67d7b3be57 to your computer and use it in GitHub Desktop.
Script for running SPARQL queries against LSIF dumps for debugging (rdflib and oxigraph)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
from pygments import highlight | |
from pygments.lexers import PythonLexer | |
from pygments.formatters import Terminal256Formatter | |
from pprint import pformat | |
import os | |
import json | |
from prettytable import PrettyTable | |
from colored import stylize, fg | |
from pyoxigraph import MemoryStore, NamedNode, NamedNode, Literal, Quad | |
import time | |
import urllib | |
import re | |
# oxigraph is annoying to use because all values need to be valid URIs, so | |
# you end up having to encode/decode arbitrary data into URIs. | |
# Also, I'm not sure if it supports aggregations. | |
def main():
    """Load an LSIF dump into an in-memory oxigraph store and run debug queries.

    Each line of the dump is parsed as JSON, flattened into (key, value)
    pairs with ``flatten``, and stored as one quad per pair with the
    element's ``id`` as the subject. Every subject, predicate and object is
    wrapped in a ``scheme:`` IRI because the values are stored as NamedNodes.
    """
    # `import urllib` at file level does not by itself guarantee that the
    # `urllib.parse` submodule is loaded; import it explicitly so that
    # quote()/unquote() below never depend on another module's imports.
    import urllib.parse

    # os.chdir('/Users/chrismwendt/github.com/sourcegraph/lsif-go')
    # subprocess.run('go build -o lsif-go ./cmd/lsif-go', shell=True, check=True)
    os.chdir('/Users/chrismwendt/github.com/sourcegraph/scratch')
    # os.chdir('/Users/chrismwendt/github.com/sourcegraph/src-cli')
    # subprocess.run('/Users/chrismwendt/github.com/sourcegraph/lsif-go/lsif-go', shell=True, check=True)

    store = MemoryStore()

    def encode(v):
        """Turn an arbitrary value into a percent-encoded ``scheme:`` IRI string."""
        return 'scheme:' + urllib.parse.quote(str(v))

    def decode(v):
        """Reverse ``encode``: strip the ``scheme:`` prefix (if any) and unquote."""
        return urllib.parse.unquote(v.removeprefix('scheme:'))

    def l(v):
        """Return the SPARQL IRI reference (``<scheme:...>``) for a value."""
        return '<' + encode(v) + '>'

    def nn(v):
        """Wrap a value in a NamedNode using the ``scheme:`` encoding."""
        return NamedNode(encode(v))

    def add(a, b, c):
        """Store the triple (a, b, c), encoding all three terms as NamedNodes."""
        store.add(Quad(nn(a), nn(b), nn(c)))

    def query(q):
        """Run a SPARQL query and print the timing and a table of the results.

        Before execution every ``:name`` token in the query text is rewritten
        to ``<scheme:name>`` so queries can use a short predicate syntax.
        NOTE(review): the pattern only matches ``\\w+``, so dotted keys that
        ``flatten`` produces (``a.b``) cannot be written as ``:a.b`` here.
        """
        print('before', q)
        q = re.sub(r':(\w+)', lambda match: l(match.group(1)), q)
        print('after ', q)
        start = time.time()
        solutions = store.query(q)
        print(time.time() - start)
        table = PrettyTable()
        table.field_names = [v for v in solutions.variables]
        for row in solutions:
            # Unbound variables come back as None; show a visible marker.
            table.add_row([
                decode(term.value) if term is not None else "NONE"
                for term in row
            ])
        table.align = "l"
        print(table)

    with open('/Users/chrismwendt/github.com/sourcegraph/scratch/dump.lsif') as dump_file:
        # with open('/Users/chrismwendt/github.com/sourcegraph/src-cli/dump.lsif') as dump_file:
        # Iterate the file lazily instead of reading every line up front.
        for line in dump_file:
            line = line.rstrip()
            if not line:
                # A blank (e.g. trailing) line is not a JSON document.
                continue
            el = json.loads(line)
            # Normalise edge endpoints so every edge uses the same keys:
            # outV -> from
            # inV -> inVs
            # inVs -> to
            if el.get('outV') is not None:
                el['from'] = el.pop('outV')
            if el.get('inV') is not None:
                el['inVs'] = [el.pop('inV')]
            if el.get('inVs') is not None:
                el['to'] = el.pop('inVs')
            element_id = el.pop('id')
            for k, v in flatten(el):
                add(element_id, k, v)

    print(stylize("vertex -edge-> vertex", fg("green")))
    query("""
    SELECT ("sup" AS ?foo) {
      ?id1 :label ?l1 .
    }
    """)
def flatten(d):
    """Flatten a nested mapping into a list of ``[key, value]`` pairs.

    Nested mappings contribute dotted keys (``outer.inner``); each item of a
    list value becomes its own pair under the same key; every other value is
    emitted unchanged. Items inside lists are not flattened further.

    Args:
        d: The mapping to flatten.

    Returns:
        A list of two-element ``[key, value]`` lists, in iteration order.
    """
    pairs = []
    for key, value in d.items():
        # isinstance (rather than an exact type check) so that dict/list
        # subclasses are treated like their base types.
        if isinstance(value, dict):
            pairs.extend(
                [f"{key}.{sub_key}", sub_value]
                for sub_key, sub_value in flatten(value)
            )
        elif isinstance(value, list):
            pairs.extend([key, item] for item in value)
        else:
            pairs.append([key, value])
    return pairs
def flat_list(l):
    """Concatenate the items of every sub-iterable of *l* into one list."""
    flattened = []
    for sublist in l:
        flattened.extend(sublist)
    return flattened
def p(obj):
    """Pretty-print *obj* to the terminal with Python syntax highlighting."""
    formatted = pformat(obj)
    coloured = highlight(formatted, PythonLexer(), Terminal256Formatter())
    print(coloured)
# Run the debugging session only when this file is executed as a script.
if __name__ == '__main__':
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
from rdflib import Graph, Literal | |
from rdflib.namespace import Namespace | |
from pygments import highlight | |
from pygments.lexers import PythonLexer | |
from pygments.formatters import Terminal256Formatter | |
from pprint import pformat | |
import os | |
import json | |
from prettytable import PrettyTable | |
from colored import stylize, fg | |
def main():
    """Load an LSIF dump into an rdflib graph and print several debug queries.

    Each line of the dump is parsed as JSON, flattened into (key, value)
    pairs with ``flatten``, and added to the graph as one triple per pair:
    the element ``id`` (as a Literal) is the subject, the key is a predicate
    in the empty-prefix namespace, and the value (as a Literal) is the object.
    """
    # os.chdir('/Users/chrismwendt/github.com/sourcegraph/lsif-go')
    # subprocess.run('go build -o lsif-go ./cmd/lsif-go', shell=True, check=True)
    os.chdir('/Users/chrismwendt/github.com/sourcegraph/scratch')
    # os.chdir('/Users/chrismwendt/github.com/sourcegraph/src-cli')
    # subprocess.run('/Users/chrismwendt/github.com/sourcegraph/lsif-go/lsif-go', shell=True, check=True)

    g = Graph()
    n = Namespace('')
    # Bind the empty prefix so the queries below can write `:label`, `:from`, ...
    g.bind("", n)

    def add(a, b, c):
        """Add the triple (Literal(a), n[b], Literal(c)) to the graph."""
        g.add((Literal(a), n[b], Literal(c)))

    def query(q):
        """Run a SPARQL query once and print its rows as a left-aligned table."""
        result = g.query(q)
        table = PrettyTable()
        table.field_names = [v for v in result.vars]
        # Iterate the result already obtained instead of executing the
        # query a second time.
        for row in result:
            table.add_row(row)
        table.align = "l"
        print(table)

    with open('dump-before-doc-fix.lsif') as dump_file:
        # Iterate the file lazily instead of reading every line up front.
        for line in dump_file:
            line = line.rstrip()
            if not line:
                # A blank (e.g. trailing) line is not a JSON document.
                continue
            el = json.loads(line)
            # Normalise edge endpoints so every edge uses the same keys:
            # outV -> from
            # inV -> inVs
            # inVs -> to
            if el.get('outV') is not None:
                el['from'] = el.pop('outV')
            if el.get('inV') is not None:
                el['inVs'] = [el.pop('inV')]
            if el.get('inVs') is not None:
                el['to'] = el.pop('inVs')
            element_id = el.pop('id')
            for k, v in flatten(el):
                add(element_id, k, v)

    print(stylize("vertex -edge-> vertex", fg("green")))
    query("""
    SELECT DISTINCT (CONCAT(?l1, " ", ?el, " ", ?l2) as ?v_e_v) {
      ?id1 :label ?l1 .
      ?id2 :label ?l2 .
      ?e :from ?id1 .
      ?e :label ?el .
      ?e :to ?id2 .
    }
    """)

    print(
        stylize("distinct :type + :label for elements with :document",
                fg("green")))
    query("""
    SELECT DISTINCT (CONCAT(?type, ".", ?l) as ?type_label) {
      ?id1 :type ?type .
      ?id1 :label ?l .
      ?id1 :document ?_ .
    }
    """)

    print(
        stylize(
            "distinct :label for nodes that have outgoing edges with :document",
            fg("green")))
    query("""
    SELECT DISTINCT ?l {
      ?n1 :label ?l .
      ?e1 :from ?n1 .
      ?e1 :document ?_ .
    }
    """)

    print(stylize("docs", fg("green")))
    query("""
    SELECT DISTINCT ?n1 ?uri {
      ?n1 :label "document" .
      ?n1 :uri ?uri .
    }
    """)

    print(stylize("contains", fg("green")))
    query("""
    SELECT DISTINCT ?n1 (GROUP_CONCAT(?contains) as ?contains) {
      ?e1 :label "contains" .
      ?e1 :from ?n1 .
      ?e1 :to ?contains .
    }
    GROUP BY ?n1
    """)

    print(stylize("edges with doc -> ranges", fg("green")))
    query("""
    SELECT DISTINCT ?doc (GROUP_CONCAT(?range) as ?ranges) {
      ?e1 :document ?doc .
      ?e1 :to ?range .
    }
    GROUP BY ?doc
    """)

    print(stylize("ranges and their docs", fg("green")))
    query("""
    SELECT DISTINCT ?range (GROUP_CONCAT(DISTINCT ?doc) as ?docs) {
      ?e1 :document ?doc .
      ?e1 :to ?range .
    }
    GROUP BY ?range
    """)
def flatten(d):
    """Flatten a nested mapping into a list of ``[key, value]`` pairs.

    A nested mapping is flattened recursively and its keys are joined to the
    parent key with a dot; a list value yields one pair per item under the
    same key; anything else is emitted as a single pair. List items are
    emitted as-is (they are not flattened themselves).

    Args:
        d: The mapping to flatten.

    Returns:
        A list of two-element ``[key, value]`` lists, in iteration order.
    """
    r = []
    for k, v in d.items():
        # Use isinstance so subclasses of dict/list are handled like their
        # base types instead of falling through to the scalar branch.
        if isinstance(v, dict):
            for k2, v2 in flatten(v):
                r.append([f"{k}.{k2}", v2])
        elif isinstance(v, list):
            for vi in v:
                r.append([k, vi])
        else:
            r.append([k, v])
    return r
def flat_list(l):
    """Return a single list holding every item of every sub-iterable in *l*."""
    items = []
    for group in l:
        for element in group:
            items.append(element)
    return items
def p(obj):
    """Print a syntax-highlighted, pretty-formatted representation of *obj*."""
    lexer = PythonLexer()
    formatter = Terminal256Formatter()
    print(highlight(pformat(obj), lexer, formatter))
# Script entry point: importing this module does not start the queries.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment