Skip to content

Instantly share code, notes, and snippets.

@Aniketh01
Last active March 24, 2020 16:02
Show Gist options
  • Save Aniketh01/588009747126af694eeccf18fa056246 to your computer and use it in GitHub Desktop.
Save Aniketh01/588009747126af694eeccf18fa056246 to your computer and use it in GitHub Desktop.
import argparse
import datetime
import os
import re
import sys
from functools import reduce
from glob import glob

import pandas as pd
def find_baseline(dirpath, limit=100000, window=7, output_path='baseline.csv'):
    """Write each domain's most recent rolling-mean rank to a CSV file.

    Every regular file directly inside *dirpath* whose file name contains a
    ``YYYY-MM-DD`` date is read as a header-less ``rank,domain`` CSV snapshot
    taken on that date. For each domain the ranks are ordered by date, a
    rolling mean over *window* consecutive observations is computed, and the
    value for the domain's latest observation (rounded to the nearest
    integer) is kept as its baseline. The result is written to *output_path*
    with the columns ``Domain`` and ``baseline``, sorted by ascending
    baseline.

    The window counts observations, not calendar days: a domain missing from
    some snapshots is averaged over its last *window* appearances. Domains
    seen fewer than *window* times have no baseline; they are written with an
    empty ``baseline`` field and sorted last.

    Args:
        dirpath: Directory holding the dated toplist snapshots.
        limit: Maximum number of rows read from each snapshot.
        window: Number of consecutive observations averaged per domain.
        output_path: Path of the CSV file to write.

    Raises:
        ValueError: If *dirpath* contains no dated snapshot file, or if a
            file name carries a date that is not a valid calendar date.
    """
    alexa_columns = ["Rank", "Domain"]
    output_columns = ['Domain', 'baseline']
    # Raw string: "\-" in a plain literal is an invalid escape sequence.
    date_pattern = re.compile(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")

    snapshots = []
    for path in sorted(glob(os.path.join(dirpath, '*'))):
        if not os.path.isfile(path):
            continue
        # Match against the file name only: a date embedded in a parent
        # directory name would otherwise be picked up for every snapshot.
        match = date_pattern.search(os.path.basename(path))
        if match is None:
            # Not a dated snapshot (README, earlier output, ...): skip it
            # instead of failing on match.group().
            continue
        date = datetime.datetime.strptime(match.group(), '%Y-%m-%d').date()
        df = pd.read_csv(path, names=alexa_columns, nrows=limit, header=None)
        snapshots.append(df.assign(Date=date))

    if not snapshots:
        raise ValueError(
            "no dated toplist files (YYYY-MM-DD in the file name) found in "
            "{!r}".format(dirpath)
        )

    # One row per distinct (Domain, Date, Rank), ordered so that each
    # domain's observations are chronological for the rolling window.
    ranks = (
        pd.concat(snapshots, axis=0, ignore_index=True)
        .dropna(subset=['Domain', 'Rank'])
        .drop_duplicates()
        .sort_values(by=['Domain', 'Date', 'Rank'])
    )
    ranks['baseline'] = ranks.groupby('Domain')['Rank'].transform(
        lambda s: s.rolling(window).mean()
    )

    # Keep only the latest observation of every domain.
    latest = ranks.groupby('Domain').tail(1)
    latest = latest.assign(baseline=latest['baseline'].round())
    # Stable sort: domains with equal baselines stay in alphabetical order,
    # which keeps the output file deterministic.
    latest = latest.sort_values(by='baseline', kind='mergesort')
    latest.to_csv(output_path, columns=output_columns, index=False)
    # TODO: plot graph per domain on the rolling mean
def main():
    """Read the baseline directory from the command line and process it."""
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "-b",
        "--baseline_dir",
        required=True,
        help="Input baseline directory",
    )
    options = cli.parse_args()
    # The only option is the mandatory directory of toplist snapshots.
    find_baseline(options.baseline_dir)


# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment