async function getWordInfo(words, langCode = 'en') {
  const endpointUrl = 'https://query.wikidata.org/sparql';
  // NOTE: dct:language is hardcoded to wd:Q1860 (English), so langCode only
  // filters the labels and glosses - it does not switch the lexeme language.
  const sparqlQuery = `
    SELECT ?word ?lemma (GROUP_CONCAT(DISTINCT ?category; separator="|$|") AS ?grammar)
      (GROUP_CONCAT(DISTINCT ?forms; separator="|$|") AS ?LexIDs)
      (GROUP_CONCAT(DISTINCT ?gloss; separator="|$|") AS ?Senses)
      (GROUP_CONCAT(DISTINCT ?feat2; separator="|$|") AS ?Uses)
      (GROUP_CONCAT(DISTINCT ?usagewords; separator="|$|") AS ?SameMeaning)
    WHERE {
      VALUES ?word {${words.map(word => `'${word}'@${langCode}`).join(' ')}}
      ?l a ontolex:LexicalEntry ;
         dct:language wd:Q1860 ;
         wikibase:lemma ?lemma ;
         ontolex:lexicalForm ?form.
      OPTIONAL {
        ?l wikibase:lexicalCategory ?cat .
        ?cat rdfs:label ?category. FILTER(LANG(?category) = "${langCode}").
      }
      ?l ontolex:lexicalForm ?forms .
      ?forms wikibase:grammaticalFeature ?features.
      ?features rdfs:label ?feat2. FILTER(LANG(?feat2) = "${langCode}").
      ?forms ontolex:representation ?usagewords .
      ?form ontolex:representation ?word .
      ?l ontolex:sense ?sense .
      ?sense skos:definition ?gloss.
      FILTER(LANG(?gloss) = "${langCode}")
    }
    GROUP BY ?word ?lemma`;
  const headers = { 'Accept': 'application/sparql-results+json' };
  const fullUrl = endpointUrl + '?query=' + encodeURIComponent(sparqlQuery);
  try {
    const response = await fetch(fullUrl, { headers });
    const results = await response.json();
    return mapToSchema(results);
  } catch (error) {
    console.error('Error fetching data:', error);
  }
}
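// A possible extension (sketch only, not wired into the query above): since
// dct:language is hardcoded to wd:Q1860, other langCodes only change the label
// filters. A small lookup table could make the lexeme language follow langCode
// too - the QIDs here are real Wikidata language items.
const languageQids = {
  en: 'wd:Q1860', // English
  fr: 'wd:Q150',  // French
  de: 'wd:Q188',  // German
};
// e.g. interpolate `dct:language ${languageQids[langCode]} ;` into the query.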
// Reverse lookup: find the first key in obj whose value matches.
const getKeyByValue = (obj, value) => Object.keys(obj).find(key => obj[key] === value);

// Penn Treebank tags -> compromise.js tags
const compromiseMapping = {
  CC: 'Conjunction',
  CD: 'Cardinal',
  DT: 'Determiner',
  EX: 'Preposition', // existential "there"
  FW: 'Expression',
  IN: 'Preposition',
  JJ: 'Adjective',
  JJR: 'Comparative',
  JJS: 'Superlative',
  MD: 'Modal',
  NN: 'Noun',
  NNS: 'Plural',
  NNP: 'Singular',
  NNPS: 'Plural',
  POS: 'Possessive',
  PRP: 'Pronoun',
  RB: 'Adverb',
  RBR: 'Comparative',
  RBS: 'Superlative',
  RP: 'PhrasalVerb',
  PDT: 'Determiner',
  SYM: 'Expression',
  TO: 'Conjunction',
  UH: 'Expression',
  VB: 'Verb',
  VBD: 'PastTense',
  VBG: 'Gerund',
  VBN: 'Participle', // past participle
  VBP: 'PresentTense', // non-3rd person singular present
  VBZ: 'PresentTense', // 3rd person singular present
  'PRP$': 'Pronoun',
  'WP$': 'Possessive',
  WDT: 'Determiner',
  WP: 'Pronoun',
  WRB: 'Adverb',
};
// Wikidata grammatical-feature labels -> Penn Treebank tags
const verbFormsMapping = {
  "simple past": "VBD",
  "past participle in english": "VBN",
  "present participle": "VBG",
  "plural": "NNS",
  "singular": "NNP",
  "third person": "VBZ",
  "first person singular": "VBP",
  "second person singular": "VB",
  "third person plural": "VBP",
  "infinitive": "VB",
  "present": "VBP",
  "past": "VBD",
  "gerund": "VBG",
  "positive": "JJ",
  "comparative": "JJR",
  "superlative": "JJS",
};
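// Illustrative chain (values come straight from the two maps above): a
// Wikidata feature label resolves to a Penn tag, then to a compromise tag.
const demoPenn = verbFormsMapping['simple past']; // 'VBD'
const demoTag = compromiseMapping[demoPenn];      // 'PastTense'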
function mapToSchema(results) {
  results = results.results.bindings;
  const mappedResults = {
    words: []
  };
  results.forEach(result => {
    const wordInfo = {
      word: result.word.value,
      pos: result.grammar.value.split('|$|'),
      tags: null,
      lemma: null,
      // Keep only the lexeme-form ID from each form URI (the last path segment).
      wikidata: result.LexIDs.value.split('|$|').map(uri => uri.split('/').pop()),
      forms: null,
      senses: []
    };
    // Only record a lemma when it differs from the surface word.
    if (wordInfo.word.toLowerCase() !== result.lemma.value.toLowerCase()) {
      wordInfo.lemma = result.lemma.value;
    }
    const senses = result.Senses.value.split('|$|');
    const uses = result.Uses.value.split('|$|');
    const sameMeaning = result.SameMeaning.value.split('|$|');
    const postypes = {};
    wordInfo.senses = [...senses];
    for (let i = 0; i < uses.length; i++) {
      postypes[uses[i]] = sameMeaning[i] || sameMeaning[sameMeaning.length - 1];
      // Need help here spencer - these aren't mapped right. Assuming we need to change the SPARQL query?
    }
    wordInfo.forms = postypes;
    // Work out which grammatical feature produced this surface form, then
    // translate it to a Penn Treebank tag and a compromise tag.
    const type = getKeyByValue(wordInfo.forms, wordInfo.word);
    const penn = verbFormsMapping[type];
    wordInfo.tags = { wikidata: type, penn: penn, compromise: compromiseMapping[penn] };
    mappedResults.words.push(wordInfo);
  });
  console.log(mappedResults);
  return mappedResults;
}
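// Roughly what one entry of mappedResults.words looks like (illustrative
// values only - the gloss and L-number below are placeholders; real ones
// come back from Wikidata):
// {
//   word: 'hated',
//   pos: ['verb'],
//   tags: { wikidata: 'simple past', penn: 'VBD', compromise: 'PastTense' },
//   lemma: 'hate',
//   wikidata: ['L1234-F2'],
//   forms: { 'simple past': 'hated', 'infinitive': 'hate' },
//   senses: ['to dislike greatly']
// }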
// Example usage:
const wordsToQuery = ['hated', 'hate', 'going', 'go'];
getWordInfo(wordsToQuery);
@spencermountain - tagged you in this originally to ask for help, but I think I got it figured out! (revised the gist up top). Would appreciate it if you could test it out and see if there are any errors, etc.
Let me know what you think too! 😄
@spencermountain - don't know if you ever got to peek at this. But if we solved it, I think it would be useful. Plus you can feel free to use it to get all the lemmas for words you need, etc. (works properly).
cool way to get wordnet data - glad it's useful for you!
@spencermountain - don't mean to be bothersome. I was hoping to tag you once I had finished results, but I'm stumped right now (doesn't help that I think I have a concussion at the moment lol).
You can see the features for words like this here.
And you can try visualizing it here.
Some words have multiple uses - but I'm not sure how to properly map them.
For example, the query returns:
It would be ideal to have them mapped in the lexicon -
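One idea (just an untested sketch - ?FormPairs is a made-up variable name) would be to pair each feature with its representation inside GROUP_CONCAT, so the two lists can't drift out of order:

(GROUP_CONCAT(DISTINCT CONCAT(STR(?feat2), "=", STR(?usagewords)); separator="|$|") AS ?FormPairs)

then split the pairs on the JS side instead of zipping two separate lists:

const postypes = {};
for (const pair of result.FormPairs.value.split('|$|')) {
  const [feat, rep] = pair.split('=');
  (postypes[feat] ??= []).push(rep); // keep every representation per feature
}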
Again - apologies for asking for your help, but I figured you wouldn't mind, considering I built this mostly to help you / the compromise.js project lol.
Hopefully this helps somewhat - it took me hours to figure out the proper query without getting crazy timeouts etc. lol (plus you won't have to worry about me making PRs with words that don't have proper tags now hahaha!)