# ==== scraper boilerplate removed: start of independent gist snippet (PySpark imports) ====
###### Imports ######
# Stdlib first, then third-party (PEP 8 grouping); ` | |` scrape artifacts removed.
import argparse
import os

import pyspark
from pyspark import SparkFiles
from pyspark.sql import SQLContext, SparkSession
from pyspark.sql.functions import lit, udf
from pyspark.sql.types import *  # NOTE(review): wildcard import kept for compatibility with the rest of the script
# ==== scraper boilerplate removed: start of independent gist snippet (S3 setup) ====
## 0. PySpark environment, including necessary JAR files for accessing S3 from Spark
# SECURITY: never hard-code real AWS credentials in source; load them from the
# environment or an AWS credentials profile. The values below are placeholders.
os.environ['AWS_ACCESS_KEY_ID'] = 'xxx_access_key'
os.environ['AWS_SECRET_ACCESS_KEY'] = 'xxx_secret'
os.environ['PYSPARK_PYTHON'] = '/usr/bin/python2'
# Spark packages for S3 access (hadoop-aws + AWS SDK) and CSV reading.
# String is split for readability; adjacent literals concatenate to the same value.
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--packages org.apache.hadoop:hadoop-aws:2.7.1,'
    'com.amazonaws:aws-java-sdk-pom:1.10.34,'
    'com.databricks:spark-csv_2.11:1.3.0 pyspark-shell'
)

# 1. Connect to S3 Bucket
import boto3  # FIX: boto3 was used below but never imported anywhere in the snippet

s3 = boto3.client("s3")
s3_resource = boto3.resource('s3')
# ==== scraper boilerplate removed: start of independent gist snippet (bash BERT script) ====
#!/bin/bash
# Extract BERT word embeddings. getBertWordVectors.sh
# One sentence per line in; one JSON record of token vectors per line out.
input_file=./bert_sentences.txt
output_file=./bertWordVectors.jsonl
# Pre-trained model directory; $PRE_TRAINED_HOME must be set in the environment.
BERT_BASE_DIR="$PRE_TRAINED_HOME/bert/uncased_L-12_H-768_A-12"
bert_master=./bert_master
# Run BERT's feature-extraction script inside the pipenv environment
# (command arguments continue on the following lines of the original script).
pipenv run python $bert_master/extract_features.py \
# ==== scraper boilerplate removed: start of independent gist snippet (py2neo / Neo4j) ====
from py2neo import Graph

# Connect to a local Neo4j instance over the Bolt protocol.
# SECURITY: credentials are placeholders; load real ones from config/env.
graph = Graph("bolt://localhost:7687", user="xxxxx", password="xxxx")
# WARNING: wipes the entire database before reloading the dataset.
graph.delete_all()
# NOTE(review): query_rider is defined elsewhere in the original gist — verify it
# is in scope before this line runs.
graph.run(query_rider)
query_climbs = """ | |
LOAD CSV WITH HEADERS FROM "https://raw.githubusercontent.com/inserpio/tour-de-france-2014/master/tour-de-france-2014-0003-climbs.csv" AS csvLine | |
MATCH (s:Stage { number: toInt(csvLine.STAGE_NUMBER) }) |
# ==== scraper boilerplate removed: start of independent gist snippet (WordNet similarity) ====
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic

# Take the first noun synset of each word (synsets are returned in order of
# estimated frequency, so index 0 is the most common sense).
dog = wn.synsets('dog', pos=wn.NOUN)[0]
cat = wn.synsets('cat', pos=wn.NOUN)[0]
rose = wn.synsets('rose', pos=wn.NOUN)[0]
flower = wn.synsets('flower', pos=wn.NOUN)[0]

# Load the Brown-corpus information-content file, needed by IC-based
# similarity measures (Resnik, Lin, Jiang-Conrath).
brown_ic = wordnet_ic.ic('ic-brown.dat')
# ==== scraper boilerplate removed: start of independent gist snippet (Universal Sentence Encoder) ====
# URL of the Universal Sentence Encoder v1 module on TF Hub.
module_url = "https://tfhub.dev/google/universal-sentence-encoder/1?tf-hub-format=compressed"
# Import the Universal Sentence Encoder's TF Hub module.
# NOTE(review): requires `import tensorflow_hub as hub` earlier in the full
# script — `hub` is not defined in this fragment.
embed = hub.Module(module_url)
# sample text | |
messages = [ | |
# Smartphones | |
"My phone is not good.", | |
"Your cellphone looks great.", |
# ==== scraper boilerplate removed: start of independent gist snippet (Jensen-Shannon similarity) ====
def jensen_shannon(query, matrix): | |
""" | |
This function implements a Jensen-Shannon similarity | |
between the input query (an LDA topic distribution for a document) | |
and the entire corpus of topic distributions. | |
It returns an array of length M where M is the number of documents in the corpus | |
""" | |
# lets keep with the p,q notation above | |
p = query[None,:].T # take transpose | |
q = matrix.T # transpose matrix |
# ==== scraper boilerplate removed: start of independent gist snippet (word-embedding cosine similarity) ====
def cosine_distance_wordembedding_method(s1, s2):
    """Print the similarity (as a percentage) between two sentences.

    Each sentence is represented as the element-wise mean of the word-embedding
    vectors of its preprocessed tokens; similarity is 1 - cosine distance.

    NOTE(review): relies on module-level ``model`` (word-vector lookup) and
    ``preprocess`` (tokenizer) defined elsewhere in the original script, and on
    ``np`` (numpy) being imported at module level — confirm before use.
    """
    # FIX: `import scipy` alone does not guarantee the `scipy.spatial` submodule
    # is loaded; import it explicitly.
    import scipy.spatial

    # Mean-pool each sentence's word vectors into a single sentence vector.
    vector_1 = np.mean([model[word] for word in preprocess(s1)], axis=0)
    vector_2 = np.mean([model[word] for word in preprocess(s2)], axis=0)
    cosine = scipy.spatial.distance.cosine(vector_1, vector_2)
    print('Word Embedding method with a cosine distance asses that our two sentences are similar to', round((1 - cosine) * 100, 2), '%')
# ==== scraper boilerplate removed: start of independent gist snippet (Jaccard similarity) ====
def jaccard_similarity(query, document):
    """Return the Jaccard similarity |A ∩ B| / |A ∪ B| of two token iterables.

    Both arguments are converted to sets, so duplicates and order are ignored.
    Returns 0.0 when both inputs are empty (empty union), instead of raising
    ZeroDivisionError as the original did.
    """
    # Build each set once instead of twice as in the original.
    query_set = set(query)
    document_set = set(document)
    union = query_set | document_set
    if not union:  # both inputs empty -> similarity conventionally 0.0
        return 0.0
    return len(query_set & document_set) / len(union)