Skip to content

Instantly share code, notes, and snippets.

View janfait's full-sized avatar

Jan Fait janfait

  • bizmachine.com
  • Czech Republic
View GitHub Profile
######################################################################################################
# SETUP
######################################################################################################
#libraries
library(ggplot2)
library(scales)
library(dplyr)
library(reshape2)
library(stringr)
@janfait
janfait / milionScraper.R
Last active April 3, 2018 08:20
Scraper pro projekt milionchvilek.cz
###############
# SETUP
###############
library(XML)
library(httr)
library(ggplot2)
library(dplyr)
#if an RDS file has been saved previously,
# NOTE(review): the code that reads this flag is outside this excerpt - confirm what it toggles
# TRUE is used instead of T because T is an ordinary variable that can be reassigned
lazyLoad <- TRUE
###############
#################
# SETUP
#################
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import Row
from pyspark.sql.functions import lit
from pyspark.sql.functions import col, avg
from pyspark.sql.types import *
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 30 20:26:43 2017
@author: Jan Fait, [email protected]
"""
import os
import requests
import datetime
mappDmp <- setRefClass("mappDmp",
fields=list(
debug = "logical",
root = "character",
endpoints = "list",
username = "character",
password = "character",
authentication = "list",
@janfait
janfait / py_uaparser.py
Created August 16, 2016 08:39
user agent parser invoked from cmd line with input as argument
##############
# SETUP
##############
# run this file by submitting an R command system("python /home/r_shared/py_uaparser.py --infile /way/to/your/log/file.csv")
# the parsed file - if successful - will be created in the same directory, just file_parsed.csv
# the structure of the input file must be consistent: there is no mechanism for locating the UA string other than splitting
# each input line by its separators
import csv, json, re, sys, argparse, time
@janfait
janfait / ftp_search_load.R
Last active July 20, 2016 15:04
searching for a specific pattern on FTP and loading it to a folder
library(stringr)
library(RCurl)
# Get the current time string.
# The date is captured once so that the month and year parts always describe
# the same day; two separate Sys.Date() calls could straddle midnight at a
# month/year boundary and yield a mismatched stamp.
currentDate <- Sys.Date()
thisMonth <- format(currentDate, "%m")
thisYear <- format(currentDate, "%Y")
# zero-padded month followed by four-digit year (MMYYYY)
timeStamp <- paste0(thisMonth, thisYear)
# Build a file pattern to search for by joining cluster names and timestamp;
# the result is "EMC_<MMYYYY>|LC_<MMYYYY>"
filePattern <- paste(c("EMC", "LC"), timeStamp, sep = "_", collapse = "|")
@janfait
janfait / weighted_jaccard.R
Created July 20, 2016 12:59
custom weighted jaccard index for comparison of weighted sets in content+preference matching
# Example inputs for the similarity computation below.
# userPreferences holds "mobile" six times followed by "email" four times
# (presumably the repetition count is the tag's weight - confirm against similarityIndex).
userPreferences <- rep(c("mobile", "email"), times = c(6, 4))
# contentTags lists each tag once
contentTags <- c("email", "mobile")
similarityIndex <- function(content_tags=NULL,user_preferences=NULL){
#weighted intersect of content_tags and user_preference, returns a sum of weights of tags which are in both sets
wi <- sum(sapply(intersect(content_tags,user_preferences),function(i) sum(i==user_preferences)))
#weighted union of all items entering the computation (content_tags have a weight==1)
wu <- length(user_preferences)+length(setdiff(content_tags,user_preferences))
@janfait
janfait / ggsurv.R
Last active June 9, 2016 08:06
Edwin Thoen's ggsurv plotting function for survfit objects from the survival package - cure to terrible basic plots
ggsurv <- function(s, CI = 'def', plot.cens = T, surv.col = 'gg.def',
cens.col = 'red', lty.est = 1, lty.ci = 2,
cens.shape = 3, back.white = F, xlab = 'Time',
ylab = 'Survival', main = ''){
library(ggplot2)
strata <- ifelse(is.null(s$strata) ==T, 1, length(s$strata))
stopifnot(length(surv.col) == 1 | length(surv.col) == strata)
stopifnot(length(lty.est) == 1 | length(lty.est) == strata)
@janfait
janfait / encrypt.R
Created June 8, 2016 11:57
encryption function which is, well, impossible to decrypt
encrypt <- function(v) {
#get uniques
vU <- unique(v)
vU <- vU[!is.na(vU) & vU!=""]
#apply the encryption over a vector
vE <- sapply(vU, function(x){
oL <- c(LETTERS, letters)
oN <- 0:9