Created
November 4, 2016 02:32
-
-
Save jexp/d3110f05b25914ffb698775f02d04696 to your computer and use it in GitHub Desktop.
Load and query the Microsoft Concept Graph in Neo4j https://concept.research.microsoft.com/Home/Introduction
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Import variant 1: pre-extract the node ids into their own files, then
# bulk-load everything with neo4j-import.
#
# Reads   ./data-concept-instance-relations.txt, a TAB-separated file. Per the
#         headers written below, column 1 is used as the Concept name, column 2
#         as the Instance name and column 3 as the integer "relations" property.
# Writes  ./concepts.txt, ./instances.txt, ./is_a.hdr and the store ./concepts.db.
# Needs   NEO4J_HOME pointing at an installation that provides bin/neo4j-import.
function import_extract_first {
  local data=data-concept-instance-relations.txt

  # Node files: one header line followed by the sorted, de-duplicated column.
  echo "name:ID(Concept)" > concepts.txt
  cut -d $'\t' -f 1 "$data" | sort | uniq >> concepts.txt

  echo "name:ID(Instance)" > instances.txt
  cut -d $'\t' -f 2 "$data" | sort | uniq >> instances.txt

  # The header fields must be TAB-separated, like the data (--delimiter TAB);
  # a space here would merge the last two columns into one bogus field.
  echo $':END_ID(Concept)\t:START_ID(Instance)\trelations:int' > is_a.hdr

  "$NEO4J_HOME"/bin/neo4j-import --into concepts.db --id-type string --delimiter TAB --bad-tolerance 13000000 --skip-duplicate-nodes true --skip-bad-relationships true \
    --nodes:Concept concepts.txt \
    --nodes:Instance instances.txt \
    --relationships:IS_A is_a.hdr,"$data"
}
# Import variant 2: no pre-processing. The same relations file is fed to
# neo4j-import three times, each time with a different header that picks one
# column and marks the others :IGNORE; duplicate node ids are dropped by the
# importer itself (--skip-duplicate-nodes true).
#
# Reads   ./data-concept-instance-relations.txt (TAB-separated, three columns
#         according to the headers written below).
# Writes  ./concept.hdr, ./instance.hdr, ./is_a.hdr and the store ./concepts.db.
# Needs   NEO4J_HOME pointing at an installation that provides bin/neo4j-import.
function import_skip_duplicates {
  local data=data-concept-instance-relations.txt

  # Every header needs exactly one TAB-separated field per data column;
  # a space instead of a TAB would fuse two fields into a single invalid one.
  echo $'name:ID(Concept)\t:IGNORE\t:IGNORE' > concept.hdr
  echo $':IGNORE\tname:ID(Instance)\t:IGNORE' > instance.hdr
  echo $':END_ID(Concept)\t:START_ID(Instance)\trelations:int' > is_a.hdr

  "$NEO4J_HOME"/bin/neo4j-import --into concepts.db --id-type string --delimiter TAB --bad-tolerance 13000000 --skip-duplicate-nodes true --skip-bad-relationships true \
    --nodes:Concept concept.hdr,"$data" \
    --nodes:Instance instance.hdr,"$data" \
    --relationships:IS_A is_a.hdr,"$data"
}
# Driver: make sure the data file is present, run the import, then create the
# uniqueness constraints on the freshly built store.

# The data has to be downloaded and unzipped by hand; stop if it is missing.
if [ ! -f data-concept-instance-relations.txt ]; then
  # Quoted so that '&' is printed instead of backgrounding the command.
  echo "Download & Unzip from here: https://concept.research.microsoft.com/Home/Download" >&2
  exit 1
  # curl -IL -o data-concept.zip 'https://concept.research.microsoft.com/Home/DownloadData?key=S54IIWamuQc9YD1WQA6tzKULMMZ79xK6&h=2127477893'
  # unzip -j data-concept.zip
fi

# Keep a NEO4J_HOME supplied by the caller; otherwise fall back to this path.
export NEO4J_HOME=${NEO4J_HOME-/data/versions/neo4j-enterprise-3.0.7}

import_extract_first

# Feed both constraint statements to neo4j-shell running against the store.
echo $'
CREATE CONSTRAINT ON (i:Instance) ASSERT i.name IS UNIQUE;\n
CREATE CONSTRAINT ON (c:Concept) ASSERT c.name IS UNIQUE;' | "$NEO4J_HOME"/bin/neo4j-shell -path concepts.db
# Everything from here to the closing EOF line is reference material (import
# statistics, sample Cypher queries and their output), not commands.
# ':' is a no-op that accepts the here-document; `echo` never reads stdin, so
# it only printed a blank line. The quoted delimiter disables all expansion
# inside the notes.
: << 'EOF'
IMPORT DONE in 1m 27s 888ms. | |
Imported: | |
17878053 nodes | |
33377320 relationships | |
51255373 properties | |
Peak memory usage: 410.36 MB | |
cypher runtime=compiled profile | |
MATCH (i:Instance {name:"apple"})-[r:IS_A]->(c:Concept) | |
RETURN i.name, r.relations, c.name ORDER BY r.relations DESC LIMIT 10; | |
MATCH (i:Instance {name:"apple"})-[r:IS_A]->(c:Concept) | |
> RETURN i.name, r.relations, c.name ORDER BY r.relations DESC LIMIT 10; | |
+----------------------------------------+ | |
| i.name | r.relations | c.name | | |
+----------------------------------------+ | |
| "apple" | 6315 | "fruit" | | |
| "apple" | 4353 | "company" | | |
| "apple" | 1152 | "food" | | |
| "apple" | 764 | "brand" | | |
| "apple" | 750 | "fresh fruit" | | |
| "apple" | 568 | "fruit tree" | | |
| "apple" | 483 | "crop" | | |
| "apple" | 280 | "corporation" | | |
| "apple" | 279 | "manufacturer" | | |
| "apple" | 257 | "firm" | | |
+----------------------------------------+ | |
10 rows | |
20 ms | |
explain | |
MATCH (a:Instance {name:"apple"})-[:IS_A]->(c:Concept)<-[:IS_A]-(b:Instance {name:"pie"}) | |
USING INDEX a:Instance(name) | |
USING INDEX b:Instance(name) | |
MATCH (c)<-[:IS_A]-(o) WHERE o <> a and o <> b | |
WITH o, count(*) as freq order by freq desc limit 10 | |
RETURN o.name, freq; | |
export a_name="apple" | |
export b_name="pie" | |
export b_name="ipad" | |
MATCH (a:Instance {name:{a_name}})-[:IS_A]->(c:Concept)<-[:IS_A]-(b:Instance {name:{b_name}}) | |
USING INDEX a:Instance(name) | |
USING INDEX b:Instance(name) | |
MATCH (c)<-[:IS_A]-(o) | |
WITH o, count(*) as freq order by freq desc SKIP 2 limit 10 | |
RETURN o.name, freq; | |
export a_name="apple" | |
export b_name="pie" | |
# export b_name="ipad" | |
cypher runtime=compiled profile | |
MATCH (a:Instance {name:{a_name}})-[r1:IS_A]->(c:Concept)<-[r2:IS_A]-(b:Instance {name:{b_name}}) | |
USING INDEX a:Instance(name) | |
USING INDEX b:Instance(name) | |
RETURN c.name, r1.relations+r2.relations as freq order by freq desc limit 10; | |
+--------------------------+ | |
| c.name | freq | | |
+--------------------------+ | |
| "device" | 1139 | | |
| "mobile device" | 998 | | |
| "brand" | 772 | | |
| "item" | 396 | | |
| "product" | 320 | | |
| "player" | 201 | | |
| "technology" | 191 | | |
| "apple product" | 182 | | |
| "client" | 147 | | |
| "portable device" | 139 | | |
+--------------------------+ | |
10 rows | |
16 ms | |
export a_name="apple" | |
export b_name="pie" | |
# export b_name="ipad" | |
MATCH (a:Instance {name:{a_name}})-[r1:IS_A]->(c:Concept)<-[r2:IS_A]-(b:Instance {name:{b_name}}) | |
USING INDEX a:Instance(name) | |
USING INDEX b:Instance(name) | |
RETURN c.name, r1.relations+r2.relations as freq order by freq desc limit 10; | |
+----------------------+ | |
| c.name | freq | | |
+----------------------+ | |
| "fruit" | 6316 | | |
| "food" | 1408 | | |
| "item" | 345 | | |
| "product" | 268 | | |
| "dessert" | 259 | | |
| "flavor" | 221 | | |
| "baked goods" | 209 | | |
| "ingredient" | 184 | | |
| "business" | 144 | | |
| "snack" | 144 | | |
+----------------------+ | |
10 rows | |
15 ms | |
MATCH (a:Instance {name:"apple"})-[:IS_A]->(c:Concept)<-[:IS_A]-(b:Instance {name:"pie"}) | |
USING INDEX a:Instance(name) | |
USING INDEX b:Instance(name) | |
RETURN count(*); | |
explain | |
with "the apple engineer is eating the apple" as sentence | |
unwind split(sentence," ") as word | |
match (i:Instance {name:word}) | |
with collect(i) as instances | |
unwind range(0,length(instances)-2) as idx | |
with idx, instances[idx] as a, instances[idx+1] as b | |
MATCH (a)-[:IS_A]->(c:Concept)<-[:IS_A]-(b) | |
MATCH (c)<-[:IS_A]-(o) WHERE o <> a AND o <> b | |
WITH idx, a, b, o, count(*) as freq order by freq desc | |
RETURN idx, a.name, b.name, collect(o.name)[0..5] as meanings order by idx; | |
+-----------------------------------------------------------------------------------------------------------------------+ | |
| idx | a.name | b.name | meanings | | |
+-----------------------------------------------------------------------------------------------------------------------+ | |
| 0 | "the" | "apple" | ["vehicle","light","money","tobacco","television"] | | |
| 1 | "apple" | "engineer" | ["bank","university","doctor","school","google"] | | |
| 2 | "engineer" | "is" | ["coos","fructose","armour","starseeds","centaur"] | | |
| 3 | "is" | "eating" | ["compensation","ukuyigxoba","next","offside","process learning"] | | |
| 4 | "eating" | "the" | ["effective honest","crushed leg","drogue parachutes","marine sergeant","nano sized"] | | |
| 5 | "the" | "apple" | ["family","sandwich","vehicle","poison","door"] | | |
+-----------------------------------------------------------------------------------------------------------------------+ | |
with "the apple engineer is eating the apple" as sentence | |
unwind split(sentence," ") as word | |
match (i:Instance {name:word}) | |
WHERE NOT (i)-[:IS_A]->(:Concept {name:"stop words"}) | |
with collect(i) as instances | |
unwind range(0,length(instances)-2) as idx | |
with idx, instances[idx] as a, instances[idx+1] as b | |
MATCH (a)-[r1:IS_A]->(c:Concept)<-[r2:IS_A]-(b) | |
WITH idx, a, b, c, r1.relations+r2.relations as freq order by freq desc | |
RETURN idx, a.name, b.name, collect({concept:c.name, relations:freq})[0..5] as concepts order by idx; | |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | |
| idx | a.name | b.name | concepts | | |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | |
| 0 | "apple" | "engineer" | [{concept -> "company", relations -> 4363},{concept -> "firm", relations -> 260},{concept -> "product", relations -> 182},{concept -> "person", relations -> 172},{concept -> "vendor", relations -> 169}] | | |
| 1 | "engineer" | "is" | [{concept -> "word", relations -> 25}] | | |
| 2 | "is" | "eating" | [{concept -> "word", relations -> 4}] | | |
| 3 | "eating" | "apple" | [{concept -> "fruit", relations -> 6316},{concept -> "activity", relations -> 736},{concept -> "item", relations -> 253},{concept -> "product", relations -> 182},{concept -> "business", relations -> 144}] | | |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | |
with "the apple engineer is eating the apple" as sentence | |
unwind split(sentence," ") as word | |
match (i:Instance {name:word}) | |
MATCH (i)-[r1:IS_A]->(c:Concept {name:"stop words"}) | |
return i,c limit 10; | |
EOF |
Sorry, no. Perhaps it is available on the Internet Archive?
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Do you still have a dump of the original IsA dataset?