jexp · November 4, 2016 02:32 · dagelf · Sep 16, 2024 · jexp · Oct 24, 2024
diff --git a/ms-concepts-import.sh b/ms-concepts-import.sh
 # Import variant that pre-extracts the node ids before calling neo4j-import.
 #
 # Reads data-concept-instance-relations.txt from the current directory, using
 # TAB-separated column 1 as the concept name and column 2 as the instance
 # name, and writes next to it:
 #   concepts.txt  - "name:ID(Concept)" header + sorted, de-duplicated column 1
 #   instances.txt - "name:ID(Instance)" header + sorted, de-duplicated column 2
 #   is_a.hdr      - TAB-separated header for the IS_A relationship columns
 # Finally runs neo4j-import from $NEO4J_HOME/bin to populate concepts.db.
 #
 # Globals:   NEO4J_HOME (read)
 # Arguments: none
 # Returns:   exit status of the neo4j-import invocation
 function import_extract_first {
   local data=data-concept-instance-relations.txt

   # Header and ids share one redirection; cut reads the file directly
   # instead of being fed by cat.
   {
     echo "name:ID(Concept)"
     cut -d $'\t' -f 1 "$data" | sort | uniq
   } > concepts.txt

   {
     echo "name:ID(Instance)"
     cut -d $'\t' -f 2 "$data" | sort | uniq
   } > instances.txt

   # The header is applied to the raw data file, so its fields follow the
   # file's column order: concept, instance, relations (int).
   printf '%s\t%s\t%s\n' ':END_ID(Concept)' ':START_ID(Instance)' 'relations:int' > is_a.hdr

   # Quoted so an installation path containing whitespace still resolves.
   "$NEO4J_HOME/bin/neo4j-import" \
     --into concepts.db \
     --id-type string \
     --delimiter TAB \
     --bad-tolerance 13000000 \
     --skip-duplicate-nodes true \
     --skip-bad-relationships true \
     --nodes:Concept concepts.txt \
     --nodes:Instance instances.txt \
     --relationships:IS_A "is_a.hdr,${data}"
 }

 # Import variant that feeds the raw data file to neo4j-import three times,
 # each time with a different header, instead of pre-extracting the node ids.
 #
 # Writes three TAB-separated header files into the current directory:
 #   concept.hdr  - column 1 is the Concept id, the other two are ignored
 #   instance.hdr - column 2 is the Instance id, the other two are ignored
 #   is_a.hdr     - column 1 / column 2 as relationship end / start ids,
 #                  column 3 as the integer "relations" property
 # The same names recur on many rows of the data file, so this variant depends
 # on the --skip-duplicate-nodes flag passed below.
 #
 # Globals:   NEO4J_HOME (read)
 # Arguments: none
 # Returns:   exit status of the neo4j-import invocation
 function import_skip_duplicates {
   local data=data-concept-instance-relations.txt

   # printf spells out every TAB, so no invisible literal tab lives in a string.
   printf '%s\t%s\t%s\n' 'name:ID(Concept)' ':IGNORE' ':IGNORE' > concept.hdr
   printf '%s\t%s\t%s\n' ':IGNORE' 'name:ID(Instance)' ':IGNORE' > instance.hdr
   printf '%s\t%s\t%s\n' ':END_ID(Concept)' ':START_ID(Instance)' 'relations:int' > is_a.hdr

   # Quoted so an installation path containing whitespace still resolves.
   "$NEO4J_HOME/bin/neo4j-import" \
     --into concepts.db \
     --id-type string \
     --delimiter TAB \
     --bad-tolerance 13000000 \
     --skip-duplicate-nodes true \
     --skip-bad-relationships true \
     --nodes:Concept "concept.hdr,${data}" \
     --nodes:Instance "instance.hdr,${data}" \
     --relationships:IS_A "is_a.hdr,${data}"
 }

 # Abort early when the data file is not in the current directory; it has to
 # be downloaded and unzipped by hand from the page named in the message.
 if [[ ! -f data-concept-instance-relations.txt ]]; then
   # Quoted: a bare "&" would background "echo Download" and then try to run a
   # command called "Unzip". The hint is a diagnostic, so it goes to stderr.
   echo 'Download & Unzip from here: https://concept.research.microsoft.com/Home/Download' >&2
   exit 1
   # Unattended download, left disabled:
   #  curl -IL -o data-concept.zip 'https://concept.research.microsoft.com/Home/DownloadData?key=S54IIWamuQc9YD1WQA6tzKULMMZ79xK6&h=2127477893'
   #  unzip -j data-concept.zip
 fi

 # Default the Neo4j installation directory unless the caller already set
 # NEO4J_HOME (the "-" form keeps a value that is set but empty).
 export NEO4J_HOME="${NEO4J_HOME-/data/versions/neo4j-enterprise-3.0.7}"

 # Build concepts.db; stop with the importer's status if it fails, so the
 # constraints below are not created on a missing or partial store.
 import_extract_first || exit

 # Pipe both uniqueness constraints into neo4j-shell, opened on the store
 # directory that was just imported. The path is quoted so an installation
 # directory containing whitespace still resolves.
 echo $'
 CREATE CONSTRAINT ON (i:Instance) ASSERT i.name IS  UNIQUE;\n
 CREATE CONSTRAINT ON (c:Concept)  ASSERT c.name IS  UNIQUE;' | "$NEO4J_HOME/bin/neo4j-shell" -path concepts.db

 # Reference notes emitted at this point of the script: what looks like a
 # recorded import summary, followed by example Cypher queries and the output
 # captured for them.
 # `cat` is used because `echo` never reads stdin, so the here-document was
 # silently dropped. The delimiter is quoted so the text is printed verbatim,
 # and the closing EOF must start in column 0 for a plain `<<` here-document.
 cat << 'EOF'

 IMPORT DONE in 1m 27s 888ms.
 Imported:
  17878053 nodes
  33377320 relationships
  51255373 properties
 Peak memory usage: 410.36 MB

 cypher runtime=compiled profile
 MATCH (i:Instance {name:"apple"})-[r:IS_A]->(c:Concept)
 RETURN i.name, r.relations, c.name ORDER BY r.relations DESC LIMIT 10;

 MATCH (i:Instance {name:"apple"})-[r:IS_A]->(c:Concept)
 > RETURN i.name, r.relations, c.name ORDER BY r.relations DESC LIMIT 10;
 +----------------------------------------+
 | i.name  | r.relations | c.name         |
 +----------------------------------------+
 | "apple" | 6315        | "fruit"        |
 | "apple" | 4353        | "company"      |
 | "apple" | 1152        | "food"         |
 | "apple" | 764         | "brand"        |
 | "apple" | 750         | "fresh fruit"  |
 | "apple" | 568         | "fruit tree"   |
 | "apple" | 483         | "crop"         |
 | "apple" | 280         | "corporation"  |
 | "apple" | 279         | "manufacturer" |
 | "apple" | 257         | "firm"         |
 +----------------------------------------+
 10 rows
 20 ms

 explain
 MATCH (a:Instance {name:"apple"})-[:IS_A]->(c:Concept)<-[:IS_A]-(b:Instance {name:"pie"})
 USING INDEX a:Instance(name)
 USING INDEX b:Instance(name)
 MATCH (c)<-[:IS_A]-(o) WHERE o <> a and o <> b
 WITH o, count(*) as freq order by freq desc limit 10
 RETURN o.name, freq;

 export a_name="apple"
 export b_name="pie"
 export b_name="ipad"
 MATCH (a:Instance {name:{a_name}})-[:IS_A]->(c:Concept)<-[:IS_A]-(b:Instance {name:{b_name}})
 USING INDEX a:Instance(name)
 USING INDEX b:Instance(name)
 MATCH (c)<-[:IS_A]-(o)
 WITH o, count(*) as freq order by freq desc SKIP 2 limit 10
 RETURN o.name, freq;


 export a_name="apple"
 export b_name="pie"
 # export b_name="ipad"
 cypher runtime=compiled profile
 MATCH (a:Instance {name:{a_name}})-[r1:IS_A]->(c:Concept)<-[r2:IS_A]-(b:Instance {name:{b_name}})
 USING INDEX a:Instance(name)
 USING INDEX b:Instance(name)
 RETURN c.name, r1.relations+r2.relations as freq order by freq desc limit 10;


 +--------------------------+
 | c.name            | freq |
 +--------------------------+
 | "device"          | 1139 |
 | "mobile device"   | 998  |
 | "brand"           | 772  |
 | "item"            | 396  |
 | "product"         | 320  |
 | "player"          | 201  |
 | "technology"      | 191  |
 | "apple product"   | 182  |
 | "client"          | 147  |
 | "portable device" | 139  |
 +--------------------------+
 10 rows
 16 ms



 export a_name="apple"
 export b_name="pie"
 # export b_name="ipad"
 MATCH (a:Instance {name:{a_name}})-[r1:IS_A]->(c:Concept)<-[r2:IS_A]-(b:Instance {name:{b_name}})
 USING INDEX a:Instance(name)
 USING INDEX b:Instance(name)
 RETURN c.name, r1.relations+r2.relations as freq order by freq desc limit 10;


 +----------------------+
 | c.name        | freq |
 +----------------------+
 | "fruit"       | 6316 |
 | "food"        | 1408 |
 | "item"        | 345  |
 | "product"     | 268  |
 | "dessert"     | 259  |
 | "flavor"      | 221  |
 | "baked goods" | 209  |
 | "ingredient"  | 184  |
 | "business"    | 144  |
 | "snack"       | 144  |
 +----------------------+
 10 rows
 15 ms


 MATCH (a:Instance {name:"apple"})-[:IS_A]->(c:Concept)<-[:IS_A]-(b:Instance {name:"pie"})
 USING INDEX a:Instance(name)
 USING INDEX b:Instance(name)
 RETURN count(*);

 explain
 with "the apple engineer is eating the apple" as sentence
 unwind split(sentence," ") as word
 match (i:Instance {name:word})
 with collect(i) as instances
 unwind range(0,length(instances)-2) as idx
 with idx, instances[idx] as a, instances[idx+1] as b
 MATCH (a)-[:IS_A]->(c:Concept)<-[:IS_A]-(b)
 MATCH (c)<-[:IS_A]-(o) WHERE o <> a AND o <> b
 WITH idx, a, b, o, count(*) as freq order by freq desc
 RETURN idx, a.name, b.name, collect(o.name)[0..5] as meanings order by idx;


 +-----------------------------------------------------------------------------------------------------------------------+
 | idx | a.name     | b.name     | meanings                                                                              |
 +-----------------------------------------------------------------------------------------------------------------------+
 | 0   | "the"      | "apple"    | ["vehicle","light","money","tobacco","television"]                                    |
 | 1   | "apple"    | "engineer" | ["bank","university","doctor","school","google"]                                      |
 | 2   | "engineer" | "is"       | ["coos","fructose","armour","starseeds","centaur"]                                    |
 | 3   | "is"       | "eating"   | ["compensation","ukuyigxoba","next","offside","process learning"]                     |
 | 4   | "eating"   | "the"      | ["effective honest","crushed leg","drogue parachutes","marine sergeant","nano sized"] |
 | 5   | "the"      | "apple"    | ["family","sandwich","vehicle","poison","door"]                                       |
 +-----------------------------------------------------------------------------------------------------------------------+

 with "the apple engineer is eating the apple" as sentence
 unwind split(sentence," ") as word
 match (i:Instance {name:word})
 WHERE NOT (i)-[:IS_A]->(:Concept {name:"stop words"})
 with collect(i) as instances
 unwind range(0,length(instances)-2) as idx
 with idx, instances[idx] as a, instances[idx+1] as b
 MATCH (a)-[r1:IS_A]->(c:Concept)<-[r2:IS_A]-(b)
 WITH idx, a, b, c, r1.relations+r2.relations as freq order by freq desc
 RETURN idx, a.name, b.name, collect({concept:c.name, relations:freq})[0..5] as concepts order by idx;


 +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
 | idx | a.name     | b.name     | concepts                                                                                                                                                                                                     |
 +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
 | 0   | "apple"    | "engineer" | [{concept -> "company", relations -> 4363},{concept -> "firm", relations -> 260},{concept -> "product", relations -> 182},{concept -> "person", relations -> 172},{concept -> "vendor", relations -> 169}]   |
 | 1   | "engineer" | "is"       | [{concept -> "word", relations -> 25}]                                                                                                                                                                       |
 | 2   | "is"       | "eating"   | [{concept -> "word", relations -> 4}]                                                                                                                                                                        |
 | 3   | "eating"   | "apple"    | [{concept -> "fruit", relations -> 6316},{concept -> "activity", relations -> 736},{concept -> "item", relations -> 253},{concept -> "product", relations -> 182},{concept -> "business", relations -> 144}] |
 +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+


 with "the apple engineer is eating the apple" as sentence
 unwind split(sentence," ") as word
 match (i:Instance {name:word})
 MATCH (i)-[r1:IS_A]->(c:Concept {name:"stop words"})
 return i,c limit 10;

EOF
	# Import variant that pre-extracts the node ids before calling neo4j-import.
	#
	# Reads data-concept-instance-relations.txt from the current directory, using
	# TAB-separated column 1 as the concept name and column 2 as the instance
	# name, and writes next to it:
	#   concepts.txt  - "name:ID(Concept)" header + sorted, de-duplicated column 1
	#   instances.txt - "name:ID(Instance)" header + sorted, de-duplicated column 2
	#   is_a.hdr      - TAB-separated header for the IS_A relationship columns
	# Finally runs neo4j-import from $NEO4J_HOME/bin to populate concepts.db.
	#
	# Globals:   NEO4J_HOME (read)
	# Arguments: none
	# Returns:   exit status of the neo4j-import invocation
	function import_extract_first {
		local data=data-concept-instance-relations.txt

		# Real pipes: an escaped "\|" is handed to the first command as a
		# literal argument, so nothing was ever cut, sorted or de-duplicated.
		{
			echo "name:ID(Concept)"
			cut -d $'\t' -f 1 "$data" | sort | uniq
		} > concepts.txt

		{
			echo "name:ID(Instance)"
			cut -d $'\t' -f 2 "$data" | sort | uniq
		} > instances.txt

		# All three fields must be TAB-separated to match --delimiter TAB;
		# printf spells every TAB out so none can degrade into a space.
		printf '%s\t%s\t%s\n' ':END_ID(Concept)' ':START_ID(Instance)' 'relations:int' > is_a.hdr

		# Quoted so an installation path containing whitespace still resolves.
		"$NEO4J_HOME/bin/neo4j-import" \
			--into concepts.db \
			--id-type string \
			--delimiter TAB \
			--bad-tolerance 13000000 \
			--skip-duplicate-nodes true \
			--skip-bad-relationships true \
			--nodes:Concept concepts.txt \
			--nodes:Instance instances.txt \
			--relationships:IS_A "is_a.hdr,${data}"
	}

	# Import variant that feeds the raw data file to neo4j-import three times,
	# each time with a different header, instead of pre-extracting the node ids.
	#
	# Writes three TAB-separated header files into the current directory:
	#   concept.hdr  - column 1 is the Concept id, the other two are ignored
	#   instance.hdr - column 2 is the Instance id, the other two are ignored
	#   is_a.hdr     - column 1 / column 2 as relationship end / start ids,
	#                  column 3 as the integer "relations" property
	# The same names recur on many rows of the data file, so this variant depends
	# on the --skip-duplicate-nodes flag passed below.
	#
	# Globals:   NEO4J_HOME (read)
	# Arguments: none
	# Returns:   exit status of the neo4j-import invocation
	function import_skip_duplicates {
		local data=data-concept-instance-relations.txt

		# Every header needs three TAB-separated fields to line up with the
		# data file under --delimiter TAB; printf spells each TAB out so none
		# can degrade into a space.
		printf '%s\t%s\t%s\n' 'name:ID(Concept)' ':IGNORE' ':IGNORE' > concept.hdr
		printf '%s\t%s\t%s\n' ':IGNORE' 'name:ID(Instance)' ':IGNORE' > instance.hdr
		printf '%s\t%s\t%s\n' ':END_ID(Concept)' ':START_ID(Instance)' 'relations:int' > is_a.hdr

		# Quoted so an installation path containing whitespace still resolves.
		"$NEO4J_HOME/bin/neo4j-import" \
			--into concepts.db \
			--id-type string \
			--delimiter TAB \
			--bad-tolerance 13000000 \
			--skip-duplicate-nodes true \
			--skip-bad-relationships true \
			--nodes:Concept "concept.hdr,${data}" \
			--nodes:Instance "instance.hdr,${data}" \
			--relationships:IS_A "is_a.hdr,${data}"
	}

	# Abort early when the data file is not in the current directory; it has to
	# be downloaded and unzipped by hand from the page named in the message.
	if [[ ! -f data-concept-instance-relations.txt ]]; then
		# Quoted: a bare "&" would background "echo Download" and then try to
		# run a command called "Unzip". The hint is a diagnostic, so it goes
		# to stderr.
		echo 'Download & Unzip from here: https://concept.research.microsoft.com/Home/Download' >&2
		exit 1
		# Unattended download, left disabled:
		# curl -IL -o data-concept.zip 'https://concept.research.microsoft.com/Home/DownloadData?key=S54IIWamuQc9YD1WQA6tzKULMMZ79xK6&h=2127477893'
		# unzip -j data-concept.zip
	fi

	# Default the Neo4j installation directory unless the caller already set
	# NEO4J_HOME (the "-" form keeps a value that is set but empty).
	export NEO4J_HOME="${NEO4J_HOME-/data/versions/neo4j-enterprise-3.0.7}"

	# Build concepts.db; stop with the importer's status if it fails, so the
	# constraints below are not created on a missing or partial store.
	import_extract_first || exit

	# Pipe both uniqueness constraints into neo4j-shell, opened on the store
	# directory that was just imported. The pipe must be a real "|": with an
	# escaped "\|" echo only printed the shell's command line as text.
	echo $'
	CREATE CONSTRAINT ON (i:Instance) ASSERT i.name IS UNIQUE;\n
	CREATE CONSTRAINT ON (c:Concept) ASSERT c.name IS UNIQUE;' | "$NEO4J_HOME/bin/neo4j-shell" -path concepts.db

	# Reference notes emitted at this point of the script: what looks like a
	# recorded import summary, followed by example Cypher queries and the output
	# captured for them.
	# `cat` is used because `echo` never reads stdin, so the here-document was
	# silently dropped. `<<-` strips the leading tabs, which lets the indented
	# EOF terminate the document. The delimiter is quoted so the text is
	# printed verbatim. Table rows use plain "|" separators, padded to the
	# width of their borders.
	cat <<- 'EOF'

	IMPORT DONE in 1m 27s 888ms.
	Imported:
	17878053 nodes
	33377320 relationships
	51255373 properties
	Peak memory usage: 410.36 MB

	cypher runtime=compiled profile
	MATCH (i:Instance {name:"apple"})-[r:IS_A]->(c:Concept)
	RETURN i.name, r.relations, c.name ORDER BY r.relations DESC LIMIT 10;

	MATCH (i:Instance {name:"apple"})-[r:IS_A]->(c:Concept)
	> RETURN i.name, r.relations, c.name ORDER BY r.relations DESC LIMIT 10;
	+----------------------------------------+
	| i.name  | r.relations | c.name         |
	+----------------------------------------+
	| "apple" | 6315        | "fruit"        |
	| "apple" | 4353        | "company"      |
	| "apple" | 1152        | "food"         |
	| "apple" | 764         | "brand"        |
	| "apple" | 750         | "fresh fruit"  |
	| "apple" | 568         | "fruit tree"   |
	| "apple" | 483         | "crop"         |
	| "apple" | 280         | "corporation"  |
	| "apple" | 279         | "manufacturer" |
	| "apple" | 257         | "firm"         |
	+----------------------------------------+
	10 rows
	20 ms

	explain
	MATCH (a:Instance {name:"apple"})-[:IS_A]->(c:Concept)<-[:IS_A]-(b:Instance {name:"pie"})
	USING INDEX a:Instance(name)
	USING INDEX b:Instance(name)
	MATCH (c)<-[:IS_A]-(o) WHERE o <> a and o <> b
	WITH o, count(*) as freq order by freq desc limit 10
	RETURN o.name, freq;

	export a_name="apple"
	export b_name="pie"
	export b_name="ipad"
	MATCH (a:Instance {name:{a_name}})-[:IS_A]->(c:Concept)<-[:IS_A]-(b:Instance {name:{b_name}})
	USING INDEX a:Instance(name)
	USING INDEX b:Instance(name)
	MATCH (c)<-[:IS_A]-(o)
	WITH o, count(*) as freq order by freq desc SKIP 2 limit 10
	RETURN o.name, freq;


	export a_name="apple"
	export b_name="pie"
	# export b_name="ipad"
	cypher runtime=compiled profile
	MATCH (a:Instance {name:{a_name}})-[r1:IS_A]->(c:Concept)<-[r2:IS_A]-(b:Instance {name:{b_name}})
	USING INDEX a:Instance(name)
	USING INDEX b:Instance(name)
	RETURN c.name, r1.relations+r2.relations as freq order by freq desc limit 10;


	+--------------------------+
	| c.name            | freq |
	+--------------------------+
	| "device"          | 1139 |
	| "mobile device"   | 998  |
	| "brand"           | 772  |
	| "item"            | 396  |
	| "product"         | 320  |
	| "player"          | 201  |
	| "technology"      | 191  |
	| "apple product"   | 182  |
	| "client"          | 147  |
	| "portable device" | 139  |
	+--------------------------+
	10 rows
	16 ms



	export a_name="apple"
	export b_name="pie"
	# export b_name="ipad"
	MATCH (a:Instance {name:{a_name}})-[r1:IS_A]->(c:Concept)<-[r2:IS_A]-(b:Instance {name:{b_name}})
	USING INDEX a:Instance(name)
	USING INDEX b:Instance(name)
	RETURN c.name, r1.relations+r2.relations as freq order by freq desc limit 10;


	+----------------------+
	| c.name        | freq |
	+----------------------+
	| "fruit"       | 6316 |
	| "food"        | 1408 |
	| "item"        | 345  |
	| "product"     | 268  |
	| "dessert"     | 259  |
	| "flavor"      | 221  |
	| "baked goods" | 209  |
	| "ingredient"  | 184  |
	| "business"    | 144  |
	| "snack"       | 144  |
	+----------------------+
	10 rows
	15 ms


	MATCH (a:Instance {name:"apple"})-[:IS_A]->(c:Concept)<-[:IS_A]-(b:Instance {name:"pie"})
	USING INDEX a:Instance(name)
	USING INDEX b:Instance(name)
	RETURN count(*);

	explain
	with "the apple engineer is eating the apple" as sentence
	unwind split(sentence," ") as word
	match (i:Instance {name:word})
	with collect(i) as instances
	unwind range(0,length(instances)-2) as idx
	with idx, instances[idx] as a, instances[idx+1] as b
	MATCH (a)-[:IS_A]->(c:Concept)<-[:IS_A]-(b)
	MATCH (c)<-[:IS_A]-(o) WHERE o <> a AND o <> b
	WITH idx, a, b, o, count(*) as freq order by freq desc
	RETURN idx, a.name, b.name, collect(o.name)[0..5] as meanings order by idx;


	+-----------------------------------------------------------------------------------------------------------------------+
	| idx | a.name     | b.name     | meanings                                                                              |
	+-----------------------------------------------------------------------------------------------------------------------+
	| 0   | "the"      | "apple"    | ["vehicle","light","money","tobacco","television"]                                    |
	| 1   | "apple"    | "engineer" | ["bank","university","doctor","school","google"]                                      |
	| 2   | "engineer" | "is"       | ["coos","fructose","armour","starseeds","centaur"]                                    |
	| 3   | "is"       | "eating"   | ["compensation","ukuyigxoba","next","offside","process learning"]                     |
	| 4   | "eating"   | "the"      | ["effective honest","crushed leg","drogue parachutes","marine sergeant","nano sized"] |
	| 5   | "the"      | "apple"    | ["family","sandwich","vehicle","poison","door"]                                       |
	+-----------------------------------------------------------------------------------------------------------------------+

	with "the apple engineer is eating the apple" as sentence
	unwind split(sentence," ") as word
	match (i:Instance {name:word})
	WHERE NOT (i)-[:IS_A]->(:Concept {name:"stop words"})
	with collect(i) as instances
	unwind range(0,length(instances)-2) as idx
	with idx, instances[idx] as a, instances[idx+1] as b
	MATCH (a)-[r1:IS_A]->(c:Concept)<-[r2:IS_A]-(b)
	WITH idx, a, b, c, r1.relations+r2.relations as freq order by freq desc
	RETURN idx, a.name, b.name, collect({concept:c.name, relations:freq})[0..5] as concepts order by idx;


	+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
	| idx | a.name     | b.name     | concepts                                                                                                                                                                                                     |
	+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
	| 0   | "apple"    | "engineer" | [{concept -> "company", relations -> 4363},{concept -> "firm", relations -> 260},{concept -> "product", relations -> 182},{concept -> "person", relations -> 172},{concept -> "vendor", relations -> 169}]   |
	| 1   | "engineer" | "is"       | [{concept -> "word", relations -> 25}]                                                                                                                                                                       |
	| 2   | "is"       | "eating"   | [{concept -> "word", relations -> 4}]                                                                                                                                                                        |
	| 3   | "eating"   | "apple"    | [{concept -> "fruit", relations -> 6316},{concept -> "activity", relations -> 736},{concept -> "item", relations -> 253},{concept -> "product", relations -> 182},{concept -> "business", relations -> 144}] |
	+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+


	with "the apple engineer is eating the apple" as sentence
	unwind split(sentence," ") as word
	match (i:Instance {name:word})
	MATCH (i)-[r1:IS_A]->(c:Concept {name:"stop words"})
	return i,c limit 10;

	EOF