Skip to content

Instantly share code, notes, and snippets.

@jexp
Created November 4, 2016 02:32
Show Gist options
  • Save jexp/d3110f05b25914ffb698775f02d04696 to your computer and use it in GitHub Desktop.
Save jexp/d3110f05b25914ffb698775f02d04696 to your computer and use it in GitHub Desktop.
Load and query the Microsoft Concept Graph in Neo4j https://concept.research.microsoft.com/Home/Introduction
# Import variant 1: pre-extract the distinct node ids, then bulk-import.
#
# Reads data-concept-instance-relations.txt from the current directory
# (TAB-separated; column 1 is used as the concept name, column 2 as the
# instance name, column 3 is loaded as the integer "relations" property)
# and writes concepts.txt, instances.txt and is_a.hdr next to it before
# invoking neo4j-import into ./concepts.db.
#
# Requires NEO4J_HOME to point at a Neo4j installation that ships
# bin/neo4j-import.
function import_extract_first {
local data_file=data-concept-instance-relations.txt

# Node files: one header line followed by the de-duplicated column values.
echo "name:ID(Concept)" > concepts.txt
cut -d $'\t' -f 1 "$data_file" | sort -u >> concepts.txt
echo "name:ID(Instance)" > instances.txt
cut -d $'\t' -f 2 "$data_file" | sort -u >> instances.txt

# Relationship header. Every field must be separated by a real TAB because
# the import runs with --delimiter TAB; a space here would merge the last two
# fields into a single (invalid) column definition.
printf ':END_ID(Concept)\t:START_ID(Instance)\trelations:int\n' > is_a.hdr

"$NEO4J_HOME/bin/neo4j-import" --into concepts.db --id-type string --delimiter TAB --bad-tolerance 13000000 --skip-duplicate-nodes true --skip-bad-relationships true \
--nodes:Concept concepts.txt \
--nodes:Instance instances.txt \
--relationships:IS_A is_a.hdr,"$data_file"
}
# Import variant 2: feed the raw relations file to neo4j-import three times
# (once per node label, once for the relationships) and let the importer drop
# the duplicate nodes via --skip-duplicate-nodes.
#
# Each pass uses its own header file so that only the relevant column is read
# and the others are marked :IGNORE. Writes concept.hdr, instance.hdr and
# is_a.hdr into the current directory and imports into ./concepts.db.
#
# Requires NEO4J_HOME to point at a Neo4j installation that ships
# bin/neo4j-import.
function import_skip_duplicates {
local data_file=data-concept-instance-relations.txt

# All header fields are TAB-separated to match --delimiter TAB; a space
# between two fields would collapse them into one column definition.
printf 'name:ID(Concept)\t:IGNORE\t:IGNORE\n' > concept.hdr
printf ':IGNORE\tname:ID(Instance)\t:IGNORE\n' > instance.hdr
printf ':END_ID(Concept)\t:START_ID(Instance)\trelations:int\n' > is_a.hdr

"$NEO4J_HOME/bin/neo4j-import" --into concepts.db --id-type string --delimiter TAB --bad-tolerance 13000000 --skip-duplicate-nodes true --skip-bad-relationships true \
--nodes:Concept concept.hdr,"$data_file" \
--nodes:Instance instance.hdr,"$data_file" \
--relationships:IS_A is_a.hdr,"$data_file"
}
# Abort early when the data set has not been downloaded and unpacked yet.
# (`if` must be closed by `then`, and the message is quoted so that the `&`
# is printed instead of backgrounding the echo and running `Unzip`.)
if [ ! -f data-concept-instance-relations.txt ]; then
echo "Download & Unzip from here: https://concept.research.microsoft.com/Home/Download" >&2
exit 1
# curl -IL -o data-concept.zip 'https://concept.research.microsoft.com/Home/DownloadData?key=S54IIWamuQc9YD1WQA6tzKULMMZ79xK6&h=2127477893'
# unzip -j data-concept.zip
fi

# Use the caller's NEO4J_HOME when set and non-empty, otherwise the default
# installation path.
export NEO4J_HOME="${NEO4J_HOME:-/data/versions/neo4j-enterprise-3.0.7}"

import_extract_first

# Create uniqueness constraints on the name property of both node labels in
# the freshly imported store.
printf '%s\n' \
'CREATE CONSTRAINT ON (i:Instance) ASSERT i.name IS UNIQUE;' \
'CREATE CONSTRAINT ON (c:Concept) ASSERT c.name IS UNIQUE;' \
| "$NEO4J_HOME/bin/neo4j-shell" -path concepts.db
# Print the recorded import statistics, example Cypher queries and their
# sample output. `cat` is required here: `echo` ignores stdin, so the
# here-document would be silently discarded. The delimiter is quoted so the
# text is emitted verbatim without any shell expansion.
cat << 'EOF'
IMPORT DONE in 1m 27s 888ms.
Imported:
17878053 nodes
33377320 relationships
51255373 properties
Peak memory usage: 410.36 MB
cypher runtime=compiled profile
MATCH (i:Instance {name:"apple"})-[r:IS_A]->(c:Concept)
RETURN i.name, r.relations, c.name ORDER BY r.relations DESC LIMIT 10;
MATCH (i:Instance {name:"apple"})-[r:IS_A]->(c:Concept)
> RETURN i.name, r.relations, c.name ORDER BY r.relations DESC LIMIT 10;
+----------------------------------------+
| i.name | r.relations | c.name |
+----------------------------------------+
| "apple" | 6315 | "fruit" |
| "apple" | 4353 | "company" |
| "apple" | 1152 | "food" |
| "apple" | 764 | "brand" |
| "apple" | 750 | "fresh fruit" |
| "apple" | 568 | "fruit tree" |
| "apple" | 483 | "crop" |
| "apple" | 280 | "corporation" |
| "apple" | 279 | "manufacturer" |
| "apple" | 257 | "firm" |
+----------------------------------------+
10 rows
20 ms
explain
MATCH (a:Instance {name:"apple"})-[:IS_A]->(c:Concept)<-[:IS_A]-(b:Instance {name:"pie"})
USING INDEX a:Instance(name)
USING INDEX b:Instance(name)
MATCH (c)<-[:IS_A]-(o) WHERE o <> a and o <> b
WITH o, count(*) as freq order by freq desc limit 10
RETURN o.name, freq;
export a_name="apple"
export b_name="pie"
export b_name="ipad"
MATCH (a:Instance {name:{a_name}})-[:IS_A]->(c:Concept)<-[:IS_A]-(b:Instance {name:{b_name}})
USING INDEX a:Instance(name)
USING INDEX b:Instance(name)
MATCH (c)<-[:IS_A]-(o)
WITH o, count(*) as freq order by freq desc SKIP 2 limit 10
RETURN o.name, freq;
export a_name="apple"
export b_name="pie"
# export b_name="ipad"
cypher runtime=compiled profile
MATCH (a:Instance {name:{a_name}})-[r1:IS_A]->(c:Concept)<-[r2:IS_A]-(b:Instance {name:{b_name}})
USING INDEX a:Instance(name)
USING INDEX b:Instance(name)
RETURN c.name, r1.relations+r2.relations as freq order by freq desc limit 10;
+--------------------------+
| c.name | freq |
+--------------------------+
| "device" | 1139 |
| "mobile device" | 998 |
| "brand" | 772 |
| "item" | 396 |
| "product" | 320 |
| "player" | 201 |
| "technology" | 191 |
| "apple product" | 182 |
| "client" | 147 |
| "portable device" | 139 |
+--------------------------+
10 rows
16 ms
export a_name="apple"
export b_name="pie"
# export b_name="ipad"
MATCH (a:Instance {name:{a_name}})-[r1:IS_A]->(c:Concept)<-[r2:IS_A]-(b:Instance {name:{b_name}})
USING INDEX a:Instance(name)
USING INDEX b:Instance(name)
RETURN c.name, r1.relations+r2.relations as freq order by freq desc limit 10;
+----------------------+
| c.name | freq |
+----------------------+
| "fruit" | 6316 |
| "food" | 1408 |
| "item" | 345 |
| "product" | 268 |
| "dessert" | 259 |
| "flavor" | 221 |
| "baked goods" | 209 |
| "ingredient" | 184 |
| "business" | 144 |
| "snack" | 144 |
+----------------------+
10 rows
15 ms
MATCH (a:Instance {name:"apple"})-[:IS_A]->(c:Concept)<-[:IS_A]-(b:Instance {name:"pie"})
USING INDEX a:Instance(name)
USING INDEX b:Instance(name)
RETURN count(*);
explain
with "the apple engineer is eating the apple" as sentence
unwind split(sentence," ") as word
match (i:Instance {name:word})
with collect(i) as instances
unwind range(0,length(instances)-2) as idx
with idx, instances[idx] as a, instances[idx+1] as b
MATCH (a)-[:IS_A]->(c:Concept)<-[:IS_A]-(b)
MATCH (c)<-[:IS_A]-(o) WHERE o <> a AND o <> b
WITH idx, a, b, o, count(*) as freq order by freq desc
RETURN idx, a.name, b.name, collect(o.name)[0..5] as meanings order by idx;
+-----------------------------------------------------------------------------------------------------------------------+
| idx | a.name | b.name | meanings |
+-----------------------------------------------------------------------------------------------------------------------+
| 0 | "the" | "apple" | ["vehicle","light","money","tobacco","television"] |
| 1 | "apple" | "engineer" | ["bank","university","doctor","school","google"] |
| 2 | "engineer" | "is" | ["coos","fructose","armour","starseeds","centaur"] |
| 3 | "is" | "eating" | ["compensation","ukuyigxoba","next","offside","process learning"] |
| 4 | "eating" | "the" | ["effective honest","crushed leg","drogue parachutes","marine sergeant","nano sized"] |
| 5 | "the" | "apple" | ["family","sandwich","vehicle","poison","door"] |
+-----------------------------------------------------------------------------------------------------------------------+
with "the apple engineer is eating the apple" as sentence
unwind split(sentence," ") as word
match (i:Instance {name:word})
WHERE NOT (i)-[:IS_A]->(:Concept {name:"stop words"})
with collect(i) as instances
unwind range(0,length(instances)-2) as idx
with idx, instances[idx] as a, instances[idx+1] as b
MATCH (a)-[r1:IS_A]->(c:Concept)<-[r2:IS_A]-(b)
WITH idx, a, b, c, r1.relations+r2.relations as freq order by freq desc
RETURN idx, a.name, b.name, collect({concept:c.name, relations:freq})[0..5] as concepts order by idx;
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| idx | a.name | b.name | concepts |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| 0 | "apple" | "engineer" | [{concept -> "company", relations -> 4363},{concept -> "firm", relations -> 260},{concept -> "product", relations -> 182},{concept -> "person", relations -> 172},{concept -> "vendor", relations -> 169}] |
| 1 | "engineer" | "is" | [{concept -> "word", relations -> 25}] |
| 2 | "is" | "eating" | [{concept -> "word", relations -> 4}] |
| 3 | "eating" | "apple" | [{concept -> "fruit", relations -> 6316},{concept -> "activity", relations -> 736},{concept -> "item", relations -> 253},{concept -> "product", relations -> 182},{concept -> "business", relations -> 144}] |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
with "the apple engineer is eating the apple" as sentence
unwind split(sentence," ") as word
match (i:Instance {name:word})
MATCH (i)-[r1:IS_A]->(c:Concept {name:"stop words"})
return i,c limit 10;
EOF
@dagelf
Copy link

dagelf commented Sep 16, 2024

Do you still have a dump of the original IsA dataset?

@jexp
Copy link
Author

jexp commented Oct 24, 2024

Sorry, no. Perhaps it is available on the Internet Archive?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment