# ==== scraper boilerplate removed: start of independent gist snippet (PySpark imports) ====
###### Imports ######
# Stdlib first, then third-party (PEP 8 grouping); ` | |` scrape artifacts removed.
import argparse
import os

import pyspark
from pyspark import SparkFiles
from pyspark.sql import SQLContext, SparkSession
from pyspark.sql.functions import lit, udf
from pyspark.sql.types import *  # NOTE(review): wildcard import kept for compatibility with the rest of the script
# ==== scraper boilerplate removed: start of independent gist snippet (S3 setup) ====
## 0. PySpark environment, including necessary JAR files for accessing S3 from Spark
# SECURITY: never hard-code real AWS credentials in source; load them from the
# environment or an AWS credentials profile. The values below are placeholders.
os.environ['AWS_ACCESS_KEY_ID'] = 'xxx_access_key'
os.environ['AWS_SECRET_ACCESS_KEY'] = 'xxx_secret'
os.environ['PYSPARK_PYTHON'] = '/usr/bin/python2'
# Spark packages for S3 access (hadoop-aws + AWS SDK) and CSV reading.
# String is split for readability; adjacent literals concatenate to the same value.
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--packages org.apache.hadoop:hadoop-aws:2.7.1,'
    'com.amazonaws:aws-java-sdk-pom:1.10.34,'
    'com.databricks:spark-csv_2.11:1.3.0 pyspark-shell'
)

# 1. Connect to S3 Bucket
import boto3  # FIX: boto3 was used below but never imported anywhere in the snippet

s3 = boto3.client("s3")
s3_resource = boto3.resource('s3')
# ==== scraper boilerplate removed: start of independent gist snippet (bash BERT script) ====
#!/bin/bash
# Extract BERT word embeddings. getBertWordVectors.sh
# One sentence per line in; one JSON record of token vectors per line out.
input_file=./bert_sentences.txt
output_file=./bertWordVectors.jsonl
# Pre-trained model directory; $PRE_TRAINED_HOME must be set in the environment.
BERT_BASE_DIR="$PRE_TRAINED_HOME/bert/uncased_L-12_H-768_A-12"
bert_master=./bert_master
# Run BERT's feature-extraction script inside the pipenv environment
# (command arguments continue on the following lines of the original script).
pipenv run python $bert_master/extract_features.py \
# ==== scraper boilerplate removed: start of independent gist snippet (py2neo / Neo4j) ====
from py2neo import Graph

# Connect to a local Neo4j instance over the Bolt protocol.
# SECURITY: credentials are placeholders; load real ones from config/env.
graph = Graph("bolt://localhost:7687", user="xxxxx", password="xxxx")
# WARNING: wipes the entire database before reloading the dataset.
graph.delete_all()
# NOTE(review): query_rider is defined elsewhere in the original gist — verify it
# is in scope before this line runs.
graph.run(query_rider)
query_climbs = """ | |
LOAD CSV WITH HEADERS FROM "https://raw.githubusercontent.com/inserpio/tour-de-france-2014/master/tour-de-france-2014-0003-climbs.csv" AS csvLine | |
MATCH (s:Stage { number: toInt(csvLine.STAGE_NUMBER) }) |
# ==== scraper boilerplate removed: start of independent gist snippet (WordNet similarity) ====
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic

# Take the first noun synset of each word (synsets are returned in order of
# estimated frequency, so index 0 is the most common sense).
dog = wn.synsets('dog', pos=wn.NOUN)[0]
cat = wn.synsets('cat', pos=wn.NOUN)[0]
rose = wn.synsets('rose', pos=wn.NOUN)[0]
flower = wn.synsets('flower', pos=wn.NOUN)[0]

# Load the Brown-corpus information-content file, needed by IC-based
# similarity measures (Resnik, Lin, Jiang-Conrath).
brown_ic = wordnet_ic.ic('ic-brown.dat')
# ==== scraper boilerplate removed: start of independent gist snippet (Universal Sentence Encoder) ====
# URL of the Universal Sentence Encoder v1 module on TF Hub.
module_url = "https://tfhub.dev/google/universal-sentence-encoder/1?tf-hub-format=compressed"
# Import the Universal Sentence Encoder's TF Hub module.
# NOTE(review): requires `import tensorflow_hub as hub` earlier in the full
# script — `hub` is not defined in this fragment.
embed = hub.Module(module_url)
# sample text | |
messages = [ | |
# Smartphones | |
"My phone is not good.", | |
"Your cellphone looks great.", |
# ==== scraper boilerplate removed: start of independent gist snippet (Jensen-Shannon similarity) ====
def jensen_shannon(query, matrix): | |
""" | |
This function implements a Jensen-Shannon similarity | |
between the input query (an LDA topic distribution for a document) | |
and the entire corpus of topic distributions. | |
It returns an array of length M where M is the number of documents in the corpus | |
""" | |
# lets keep with the p,q notation above | |
p = query[None,:].T # take transpose | |
q = matrix.T # transpose matrix |
# ==== scraper boilerplate removed: start of independent gist snippet (word-embedding cosine similarity) ====
def cosine_distance_wordembedding_method(s1, s2):
    """Print the similarity (as a percentage) between two sentences.

    Each sentence is represented as the element-wise mean of the word-embedding
    vectors of its preprocessed tokens; similarity is 1 - cosine distance.

    NOTE(review): relies on module-level ``model`` (word-vector lookup) and
    ``preprocess`` (tokenizer) defined elsewhere in the original script, and on
    ``np`` (numpy) being imported at module level — confirm before use.
    """
    # FIX: `import scipy` alone does not guarantee the `scipy.spatial` submodule
    # is loaded; import it explicitly.
    import scipy.spatial

    # Mean-pool each sentence's word vectors into a single sentence vector.
    vector_1 = np.mean([model[word] for word in preprocess(s1)], axis=0)
    vector_2 = np.mean([model[word] for word in preprocess(s2)], axis=0)
    cosine = scipy.spatial.distance.cosine(vector_1, vector_2)
    print('Word Embedding method with a cosine distance asses that our two sentences are similar to', round((1 - cosine) * 100, 2), '%')
# ==== scraper boilerplate removed: start of independent gist snippet (Jaccard similarity) ====
def jaccard_similarity(query, document):
    """Return the Jaccard similarity |A ∩ B| / |A ∪ B| of two token iterables.

    Both arguments are converted to sets, so duplicates and order are ignored.
    Returns 0.0 when both inputs are empty (empty union), instead of raising
    ZeroDivisionError as the original did.
    """
    # Build each set once instead of twice as in the original.
    query_set = set(query)
    document_set = set(document)
    union = query_set | document_set
    if not union:  # both inputs empty -> similarity conventionally 0.0
        return 0.0
    return len(query_set & document_set) / len(union)