Created
November 4, 2016 02:32
-
-
Save jexp/d3110f05b25914ffb698775f02d04696 to your computer and use it in GitHub Desktop.
Load and query the Microsoft Concept Graph in Neo4j https://concept.research.microsoft.com/Home/Introduction
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Import variant 1: extract de-duplicated node files first, then bulk-import.
#
# Reads  : data-concept-instance-relations.txt in the current directory,
#          TAB-separated; column 1 is used as the Concept name, column 2 as the
#          Instance name, and column 3 is declared as the integer property
#          "relations" on the IS_A relationship.
# Writes : concepts.txt, instances.txt, is_a.hdr and the store concepts.db.
# Needs  : NEO4J_HOME pointing at a Neo4j installation providing bin/neo4j-import.
function import_extract_first {
  local data=data-concept-instance-relations.txt

  # Node files: one header line followed by the distinct values of one column.
  # LC_ALL=C makes de-duplication byte-wise and independent of the user's locale.
  {
    echo "name:ID(Concept)"
    cut -d $'\t' -f 1 "$data" | LC_ALL=C sort -u
  } > concepts.txt

  {
    echo "name:ID(Instance)"
    cut -d $'\t' -f 2 "$data" | LC_ALL=C sort -u
  } > instances.txt

  # Relationship header: every field must be TAB-separated to match
  # --delimiter TAB (a plain space would glue two fields into one).
  printf '%s\t%s\t%s\n' ':END_ID(Concept)' ':START_ID(Instance)' 'relations:int' > is_a.hdr

  "${NEO4J_HOME}/bin/neo4j-import" --into concepts.db --id-type string --delimiter TAB \
    --bad-tolerance 13000000 --skip-duplicate-nodes true --skip-bad-relationships true \
    --nodes:Concept concepts.txt \
    --nodes:Instance instances.txt \
    --relationships:IS_A "is_a.hdr,${data}"
}
# Import variant 2: feed the raw relations file to the importer three times
# (once per node label, once for the relationships) and let
# --skip-duplicate-nodes drop the repeated node ids.
#
# Reads  : data-concept-instance-relations.txt in the current directory
#          (TAB-separated, three columns).
# Writes : concept.hdr, instance.hdr, is_a.hdr and the store concepts.db.
# Needs  : NEO4J_HOME pointing at a Neo4j installation providing bin/neo4j-import.
function import_skip_duplicates {
  local data=data-concept-instance-relations.txt

  # Each header names the column that is used and marks the others :IGNORE.
  # All fields are TAB-separated to match --delimiter TAB.
  printf '%s\t%s\t%s\n' 'name:ID(Concept)' ':IGNORE' ':IGNORE' > concept.hdr
  printf '%s\t%s\t%s\n' ':IGNORE' 'name:ID(Instance)' ':IGNORE' > instance.hdr
  printf '%s\t%s\t%s\n' ':END_ID(Concept)' ':START_ID(Instance)' 'relations:int' > is_a.hdr

  "${NEO4J_HOME}/bin/neo4j-import" --into concepts.db --id-type string --delimiter TAB \
    --bad-tolerance 13000000 --skip-duplicate-nodes true --skip-bad-relationships true \
    --nodes:Concept "concept.hdr,${data}" \
    --nodes:Instance "instance.hdr,${data}" \
    --relationships:IS_A "is_a.hdr,${data}"
}
# Abort early when the relations file has not been downloaded yet.
# The message is quoted so that "&" is printed literally instead of being
# parsed as the shell's background operator.
if [ ! -f data-concept-instance-relations.txt ]; then
  echo "Download & Unzip from here: https://concept.research.microsoft.com/Home/Download" >&2
  exit 1
  # curl -IL -o data-concept.zip 'https://concept.research.microsoft.com/Home/DownloadData?key=S54IIWamuQc9YD1WQA6tzKULMMZ79xK6&h=2127477893'
  # unzip -j data-concept.zip
fi
# Use the caller's NEO4J_HOME when it is set and non-empty; otherwise fall back
# to the default installation path.
export NEO4J_HOME="${NEO4J_HOME:-/data/versions/neo4j-enterprise-3.0.7}"

# Run the bulk import; stop here if it fails, so that constraints are not
# created against a missing or partial store.
import_extract_first || exit 1

# Create uniqueness constraints on the name property of both labels by piping
# the statements into neo4j-shell running against the freshly imported store.
"${NEO4J_HOME}/bin/neo4j-shell" -path concepts.db <<'CYPHER'

CREATE CONSTRAINT ON (i:Instance) ASSERT i.name IS UNIQUE;

CREATE CONSTRAINT ON (c:Concept) ASSERT c.name IS UNIQUE;
CYPHER
# Print the recorded import statistics and example Cypher sessions that follow.
# cat (unlike echo) actually reads the here-document; the quoted delimiter
# keeps the shell from expanding anything inside the notes.
cat << 'EOF'
IMPORT DONE in 1m 27s 888ms. | |
Imported: | |
17878053 nodes | |
33377320 relationships | |
51255373 properties | |
Peak memory usage: 410.36 MB | |
cypher runtime=compiled profile | |
MATCH (i:Instance {name:"apple"})-[r:IS_A]->(c:Concept) | |
RETURN i.name, r.relations, c.name ORDER BY r.relations DESC LIMIT 10; | |
MATCH (i:Instance {name:"apple"})-[r:IS_A]->(c:Concept) | |
> RETURN i.name, r.relations, c.name ORDER BY r.relations DESC LIMIT 10; | |
+----------------------------------------+ | |
| i.name | r.relations | c.name | | |
+----------------------------------------+ | |
| "apple" | 6315 | "fruit" | | |
| "apple" | 4353 | "company" | | |
| "apple" | 1152 | "food" | | |
| "apple" | 764 | "brand" | | |
| "apple" | 750 | "fresh fruit" | | |
| "apple" | 568 | "fruit tree" | | |
| "apple" | 483 | "crop" | | |
| "apple" | 280 | "corporation" | | |
| "apple" | 279 | "manufacturer" | | |
| "apple" | 257 | "firm" | | |
+----------------------------------------+ | |
10 rows | |
20 ms | |
explain | |
MATCH (a:Instance {name:"apple"})-[:IS_A]->(c:Concept)<-[:IS_A]-(b:Instance {name:"pie"}) | |
USING INDEX a:Instance(name) | |
USING INDEX b:Instance(name) | |
MATCH (c)<-[:IS_A]-(o) WHERE o <> a and o <> b | |
WITH o, count(*) as freq order by freq desc limit 10 | |
RETURN o.name, freq; | |
export a_name="apple" | |
export b_name="pie" | |
export b_name="ipad" | |
MATCH (a:Instance {name:{a_name}})-[:IS_A]->(c:Concept)<-[:IS_A]-(b:Instance {name:{b_name}}) | |
USING INDEX a:Instance(name) | |
USING INDEX b:Instance(name) | |
MATCH (c)<-[:IS_A]-(o) | |
WITH o, count(*) as freq order by freq desc SKIP 2 limit 10 | |
RETURN o.name, freq; | |
export a_name="apple" | |
export b_name="pie" | |
# export b_name="ipad" | |
cypher runtime=compiled profile | |
MATCH (a:Instance {name:{a_name}})-[r1:IS_A]->(c:Concept)<-[r2:IS_A]-(b:Instance {name:{b_name}}) | |
USING INDEX a:Instance(name) | |
USING INDEX b:Instance(name) | |
RETURN c.name, r1.relations+r2.relations as freq order by freq desc limit 10; | |
+--------------------------+ | |
| c.name | freq | | |
+--------------------------+ | |
| "device" | 1139 | | |
| "mobile device" | 998 | | |
| "brand" | 772 | | |
| "item" | 396 | | |
| "product" | 320 | | |
| "player" | 201 | | |
| "technology" | 191 | | |
| "apple product" | 182 | | |
| "client" | 147 | | |
| "portable device" | 139 | | |
+--------------------------+ | |
10 rows | |
16 ms | |
export a_name="apple" | |
export b_name="pie" | |
# export b_name="ipad" | |
MATCH (a:Instance {name:{a_name}})-[r1:IS_A]->(c:Concept)<-[r2:IS_A]-(b:Instance {name:{b_name}}) | |
USING INDEX a:Instance(name) | |
USING INDEX b:Instance(name) | |
RETURN c.name, r1.relations+r2.relations as freq order by freq desc limit 10; | |
+----------------------+ | |
| c.name | freq | | |
+----------------------+ | |
| "fruit" | 6316 | | |
| "food" | 1408 | | |
| "item" | 345 | | |
| "product" | 268 | | |
| "dessert" | 259 | | |
| "flavor" | 221 | | |
| "baked goods" | 209 | | |
| "ingredient" | 184 | | |
| "business" | 144 | | |
| "snack" | 144 | | |
+----------------------+ | |
10 rows | |
15 ms | |
MATCH (a:Instance {name:"apple"})-[:IS_A]->(c:Concept)<-[:IS_A]-(b:Instance {name:"pie"}) | |
USING INDEX a:Instance(name) | |
USING INDEX b:Instance(name) | |
RETURN count(*); | |
explain | |
with "the apple engineer is eating the apple" as sentence | |
unwind split(sentence," ") as word | |
match (i:Instance {name:word}) | |
with collect(i) as instances | |
unwind range(0,length(instances)-2) as idx | |
with idx, instances[idx] as a, instances[idx+1] as b | |
MATCH (a)-[:IS_A]->(c:Concept)<-[:IS_A]-(b) | |
MATCH (c)<-[:IS_A]-(o) WHERE o <> a AND o <> b | |
WITH idx, a, b, o, count(*) as freq order by freq desc | |
RETURN idx, a.name, b.name, collect(o.name)[0..5] as meanings order by idx; | |
+-----------------------------------------------------------------------------------------------------------------------+ | |
| idx | a.name | b.name | meanings | | |
+-----------------------------------------------------------------------------------------------------------------------+ | |
| 0 | "the" | "apple" | ["vehicle","light","money","tobacco","television"] | | |
| 1 | "apple" | "engineer" | ["bank","university","doctor","school","google"] | | |
| 2 | "engineer" | "is" | ["coos","fructose","armour","starseeds","centaur"] | | |
| 3 | "is" | "eating" | ["compensation","ukuyigxoba","next","offside","process learning"] | | |
| 4 | "eating" | "the" | ["effective honest","crushed leg","drogue parachutes","marine sergeant","nano sized"] | | |
| 5 | "the" | "apple" | ["family","sandwich","vehicle","poison","door"] | | |
+-----------------------------------------------------------------------------------------------------------------------+ | |
with "the apple engineer is eating the apple" as sentence | |
unwind split(sentence," ") as word | |
match (i:Instance {name:word}) | |
WHERE NOT (i)-[:IS_A]->(:Concept {name:"stop words"}) | |
with collect(i) as instances | |
unwind range(0,length(instances)-2) as idx | |
with idx, instances[idx] as a, instances[idx+1] as b | |
MATCH (a)-[r1:IS_A]->(c:Concept)<-[r2:IS_A]-(b) | |
WITH idx, a, b, c, r1.relations+r2.relations as freq order by freq desc | |
RETURN idx, a.name, b.name, collect({concept:c.name, relations:freq})[0..5] as concepts order by idx; | |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | |
| idx | a.name | b.name | concepts | | |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | |
| 0 | "apple" | "engineer" | [{concept -> "company", relations -> 4363},{concept -> "firm", relations -> 260},{concept -> "product", relations -> 182},{concept -> "person", relations -> 172},{concept -> "vendor", relations -> 169}] | | |
| 1 | "engineer" | "is" | [{concept -> "word", relations -> 25}] | | |
| 2 | "is" | "eating" | [{concept -> "word", relations -> 4}] | | |
| 3 | "eating" | "apple" | [{concept -> "fruit", relations -> 6316},{concept -> "activity", relations -> 736},{concept -> "item", relations -> 253},{concept -> "product", relations -> 182},{concept -> "business", relations -> 144}] | | |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | |
with "the apple engineer is eating the apple" as sentence | |
unwind split(sentence," ") as word | |
match (i:Instance {name:word}) | |
MATCH (i)-[r1:IS_A]->(c:Concept {name:"stop words"}) | |
return i,c limit 10; | |
EOF |
When I set the starting phrase, one at a time, to each of the 3 instance pairs ("apple engineer", "engineer eating" and "eating apple"), each one runs fine individually: 1 minute or less, usually about half a minute.
Do you still have a dump of the original IsA dataset?
Sorry, no. Perhaps on the Internet Archive?
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Michael,
The "the apple engineer is eating the apple" example performs well but it's a lucky test case. Try removing "is" from the phrase and the query doesn't complete. It appears to thrash or something but never returns. You can remove the "the"s as well as the "is" because "the" is an instance of the concept "stop words" i.e. it gets removed anyway. For example, "apple engineer eating apple"
The issue is with the 3rd MATCH clause (where "o" is unbounded). I'm guessing there is some sort of relationship data explosion happening for one of the 3 instance pairs: "apple engineer", "engineer eating" or "eating apple".
Any ideas for a better formulation of this query?
Best regards,
Michael Herman (Toronto)