Skip to content

Instantly share code, notes, and snippets.

View adsieg's full-sized avatar
🌴
On vacation

AdrienSieg adsieg

🌴
On vacation
View GitHub Profile
###### Imports ######
import argparse
import os
from pyspark.sql.functions import udf
from pyspark.sql.functions import lit
from pyspark.sql import SQLContext, SparkSession
from pyspark.sql.types import *
import pyspark
from pyspark import SparkFiles
## 0. PySpark environment, including necessary JAR files for accessing S3 from Spark
# NOTE(review): these are hard-coded placeholder credentials. Load real values
# from the environment, ~/.aws/credentials, or an IAM role — never commit them.
os.environ['AWS_ACCESS_KEY_ID'] = 'xxx_access_key'
os.environ['AWS_SECRET_ACCESS_KEY'] = 'xxx_secret'
# Force the Python 2 interpreter for PySpark workers (legacy cluster setup).
os.environ['PYSPARK_PYTHON'] = '/usr/bin/python2'
# Extra packages pulled in by spark-submit: hadoop-aws + AWS SDK (S3 access)
# and spark-csv (CSV reader for pre-2.0 Spark).
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.hadoop:hadoop-aws:2.7.1,com.amazonaws:aws-java-sdk-pom:1.10.34,com.databricks:spark-csv_2.11:1.3.0 pyspark-shell'
# 1. Connect to S3 Bucket
# fix: boto3 was used below but never imported anywhere in this file.
# Imported here (after the env vars are set) so it picks up the credentials above.
import boto3
s3 = boto3.client("s3")
s3_resource = boto3.resource('s3')
#!/bin/bash
# Extract BERT word embeddings. getBertWordVectors.sh
# Input: one sentence per line; output: one JSON record of token vectors per line.
input_file=./bert_sentences.txt
output_file=./bertWordVectors.jsonl
# Pre-trained BERT-Base uncased checkpoint (12 layers, 768 hidden units, 12 heads);
# $PRE_TRAINED_HOME must be set in the calling environment.
BERT_BASE_DIR="$PRE_TRAINED_HOME/bert/uncased_L-12_H-768_A-12"
bert_master=./bert_master
# Run Google's extract_features.py inside the pipenv-managed environment.
# NOTE(review): the command is truncated in this excerpt (trailing backslash) —
# the --input_file/--output_file/--vocab_file/--bert_config flags are not visible.
pipenv run python $bert_master/extract_features.py \
@adsieg
adsieg / neo4j.py
Last active September 25, 2019 14:27
from py2neo import Graph
# Connect to a local Neo4j instance over the Bolt protocol (placeholder credentials).
graph = Graph("bolt://localhost:7687", user="xxxxx", password="xxxx")
# Wipe every node and relationship so the dataset can be reloaded from scratch.
graph.delete_all()
# NOTE(review): query_rider is defined elsewhere in the original gist — not visible here.
graph.run(query_rider)
# Cypher: load the 2014 Tour de France climbs CSV and match each row to its stage.
# NOTE(review): this triple-quoted query is truncated in this excerpt (no closing quotes).
query_climbs = """
LOAD CSV WITH HEADERS FROM "https://raw.githubusercontent.com/inserpio/tour-de-france-2014/master/tour-de-france-2014-0003-climbs.csv" AS csvLine
MATCH (s:Stage { number: toInt(csvLine.STAGE_NUMBER) })
@adsieg
adsieg / knowledge_based_measure.py
Created May 7, 2019 13:06
knowledge_based_measure
# Knowledge-based word similarity setup: WordNet synsets plus an information-content
# corpus, used by WordNet similarity measures (e.g. Resnik / Lin / Jiang-Conrath).
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic
# Take the first (most frequent) noun synset for each example word.
dog=wn.synsets('dog', pos=wn.NOUN)[0] #get the first noun synonym of the word "dog"
cat=wn.synsets('cat', pos=wn.NOUN)[0]
rose=wn.synsets('rose', pos=wn.NOUN)[0]
flower=wn.synsets('flower', pos=wn.NOUN)[0]
brown_ic = wordnet_ic.ic('ic-brown.dat') #load the brown corpus to compute the IC
@adsieg
adsieg / Universal_sentence_encoder.py
Created May 6, 2019 15:28
Universal Sentence Encoder
# Version 1 of Google's Universal Sentence Encoder on TF Hub.
module_url = "https://tfhub.dev/google/universal-sentence-encoder/1?tf-hub-format=compressed"
# Import the Universal Sentence Encoder's TF Hub module
# NOTE(review): `hub` (tensorflow_hub) is imported elsewhere in the original gist.
embed = hub.Module(module_url)
# sample text
# NOTE(review): this list literal is truncated in this excerpt (no closing bracket).
messages = [
# Smartphones
"My phone is not good.",
"Your cellphone looks great.",
@adsieg
adsieg / Jensen_Shannon_distance.py
Created May 6, 2019 14:50
Jensen Shannon distance
def jensen_shannon(query, matrix):
    """
    This function implements a Jensen-Shannon similarity
    between the input query (an LDA topic distribution for a document)
    and the entire corpus of topic distributions.
    It returns an array of length M where M is the number of documents in the corpus
    """
    # lets keep with the p,q notation above
    # Column vector: the query's topic distribution, transposed for broadcasting
    # against the corpus matrix below.
    p = query[None,:].T # take transpose
    # One column per corpus document after the transpose.
    # NOTE(review): the function body is truncated in this excerpt — the actual
    # Jensen-Shannon computation and return statement are not visible.
    q = matrix.T # transpose matrix
@adsieg
adsieg / CosineDistance.py
Created May 6, 2019 13:01
Cosine Similarity
def cosine_distance_wordembedding_method(s1, s2):
    """Print how similar two sentences are under averaged word embeddings.

    Each sentence is embedded as the element-wise mean of its word vectors
    (looked up in the module-level ``model``, tokenised by the module-level
    ``preprocess``), the two vectors are compared with a cosine distance,
    and the result is printed as a similarity percentage.

    :param s1: first sentence (string).
    :param s2: second sentence (string).
    """
    # fix: `import scipy` alone does not guarantee the `scipy.spatial.distance`
    # submodule is loaded as an attribute — import the submodule explicitly.
    from scipy.spatial import distance
    # Sentence vector = mean of the vectors of its preprocessed tokens.
    vector_1 = np.mean([model[word] for word in preprocess(s1)], axis=0)
    vector_2 = np.mean([model[word] for word in preprocess(s2)], axis=0)
    cosine = distance.cosine(vector_1, vector_2)
    # fix: typo in the user-facing message ("asses" -> "assesses").
    print('Word Embedding method with a cosine distance assesses that our two sentences are similar to', round((1 - cosine) * 100, 2), '%')
@adsieg
adsieg / JaccardSimilarity.py
Created May 6, 2019 12:58
Jaccard Similarity
def jaccard_similarity(query, document):
    """Return the Jaccard similarity between two iterables of hashable tokens.

    The similarity is |A ∩ B| / |A ∪ B| over the sets of distinct tokens,
    a float in [0.0, 1.0].

    :param query: first iterable of hashable tokens.
    :param document: second iterable of hashable tokens.
    :returns: Jaccard similarity as a float; 1.0 when both inputs are empty.
    """
    query_set = set(query)
    document_set = set(document)
    union = query_set | document_set
    # fix: the original divided by len(union) unconditionally, raising
    # ZeroDivisionError when both inputs are empty. Two empty token sets are
    # identical, so report full similarity instead.
    if not union:
        return 1.0
    intersection = query_set & document_set
    return len(intersection) / len(union)