mingrammer · January 23, 2018 02:55
diff --git a/seoul-metro-max-min-in-out-elasticsearch.py b/seoul-metro-max-min-in-out-elasticsearch.py
 import math
 from pprint import pprint
 import elasticsearch as es
 import numpy as np

 # ---------------------------------------------------------------------
 # Configuration
 # ---------------------------------------------------------------------
 INDEX_NAME = 'seoul-metro-2014'  # Elasticsearch index that is searched
 THRESHOLD = 10                   # entries kept in each per-station ranking
 CHUNK_SIZE = 5000                # chunk size, in documents

 # ---------------------------------------------------------------------
 # Module-level accumulators, filled in as the script runs
 # ---------------------------------------------------------------------
 stations = []            # one {'name', 'count'} dict per station
 in_out_of_stations = []  # one highest/lowest summary dict per station
 gap_rank_list = []       # one peak-minus-mean dict per station

 # Single client shared by every search issued by this script.
 client = es.Elasticsearch(hosts=['http://doit-dev.lkaybob.pe.kr'], port=9200)

 # Terms aggregation over `station_name`: each bucket in the response holds
 # one distinct station name together with its document count.
 # NOTE(review): 'size': 0 presumably means "return every term" on the
 # Elasticsearch release this targets -- confirm against the server version.
 station_names_agg = {
     'terms': {
         'field': 'station_name',
         'size': 0,
     },
 }

 results = client.search(
     index=INDEX_NAME,
     body={'aggs': {'unique_station_names': station_names_agg}},
 )

 # Store every bucket as a {'name', 'count'} entry in the shared list.
 stations.extend(
     {'name': bucket['key'], 'count': bucket['doc_count']}
     for bucket in results['aggregations']['unique_station_names']['buckets']
 )

 # For every station: fetch per-day entry/exit totals, rank the days, and
 # record how far the busiest day sits above that station's daily mean.
 for station in stations:
     # Only the aggregation is consumed below, and an aggregation is computed
     # over every matching document regardless of how many hits come back.
     # One request with size=0 therefore replaces the previous loop, which
     # re-sent this identical query ceil(count / CHUNK_SIZE) times (no offset,
     # no scroll) and kept only the last response.
     result = client.search(
         index=INDEX_NAME,
         doc_type='seoul-metro',
         body={
             'query': {
                 'constant_score': {
                     'filter': {
                         'term': {
                             'station_name': station['name'],
                         },
                     },
                 },
             },
             'aggs': {
                 'amount_per_day': {
                     'date_histogram': {
                         'field': 'time_slot',
                         'interval': 'day',
                     },
                     'aggs': {
                         'total_in': {
                             'sum': {
                                 'field': 'people_in',
                             },
                         },
                         'total_out': {
                             'sum': {
                                 'field': 'people_out',
                             },
                         },
                     },
                 },
             },
         },
         size=0,
     )

     buckets = result['aggregations']['amount_per_day']['buckets']
     if not buckets:
         # Without any daily bucket the [0] lookups below would raise
         # IndexError, so report the station and move on.
         print('No daily buckets for station %s; skipping' % station['name'])
         continue

     # One {'date', 'total'} record per day, for entries and for exits.
     people_ins = [
         {'date': bucket['key_as_string'], 'total': bucket['total_in']['value']}
         for bucket in buckets
     ]
     people_outs = [
         {'date': bucket['key_as_string'], 'total': bucket['total_out']['value']}
         for bucket in buckets
     ]

     # Busiest day first.
     people_ins.sort(reverse=True, key=lambda d: d['total'])
     people_outs.sort(reverse=True, key=lambda d: d['total'])

     in_out_of_a_station = {
         'station': station['name'],
         'highest_people_ins': people_ins[:THRESHOLD],
         'highest_people_outs': people_outs[:THRESHOLD],
         # Walk backwards from the quietest day. The stop index has to be
         # -THRESHOLD - 1; a stop of -THRESHOLD returns only THRESHOLD - 1
         # entries.
         'lowest_people_ins': people_ins[:-THRESHOLD - 1:-1],
         'lowest_people_outs': people_outs[:-THRESHOLD - 1:-1],
     }

     # Gap between the single busiest day and the mean over all days.
     peak_in = people_ins[0]
     peak_out = people_outs[0]
     mean_in = np.mean([day['total'] for day in people_ins])
     mean_out = np.mean([day['total'] for day in people_outs])
     highest_gap = {
         'station': station['name'],
         'date_for_ins': peak_in['date'],
         'date_for_outs': peak_out['date'],
         'highest_people_ins_gap': peak_in['total'] - mean_in,
         'highest_people_outs_gap': peak_out['total'] - mean_out,
     }

     pprint(in_out_of_a_station)
     pprint(highest_gap)
     in_out_of_stations.append(in_out_of_a_station)
     gap_rank_list.append(highest_gap)

 # Order the stations by how far their busiest entry day exceeds their own
 # daily mean (largest gap first), then show the first twenty. The slice
 # assignment keeps the reordering in place on the shared list.
 gap_rank_list[:] = sorted(
     gap_rank_list,
     key=lambda entry: entry['highest_people_ins_gap'],
     reverse=True,
 )
 pprint(gap_rank_list[:20])
	import math
	from pprint import pprint
	import elasticsearch as es
	import numpy as np

	# ---------------------------------------------------------------------
	# Configuration
	# ---------------------------------------------------------------------
	INDEX_NAME = 'seoul-metro-2014'  # Elasticsearch index that is searched
	THRESHOLD = 10                   # entries kept in each per-station ranking
	CHUNK_SIZE = 5000                # chunk size, in documents

	# ---------------------------------------------------------------------
	# Module-level accumulators, filled in as the script runs
	# ---------------------------------------------------------------------
	stations = []            # one {'name', 'count'} dict per station
	in_out_of_stations = []  # one highest/lowest summary dict per station
	gap_rank_list = []       # one peak-minus-mean dict per station

	# Single client shared by every search issued by this script.
	client = es.Elasticsearch(hosts=['http://doit-dev.lkaybob.pe.kr'], port=9200)

	# Terms aggregation over `station_name`: each bucket in the response holds
	# one distinct station name together with its document count.
	# NOTE(review): 'size': 0 presumably means "return every term" on the
	# Elasticsearch release this targets -- confirm against the server version.
	station_names_agg = {
	    'terms': {
	        'field': 'station_name',
	        'size': 0,
	    },
	}

	results = client.search(
	    index=INDEX_NAME,
	    body={'aggs': {'unique_station_names': station_names_agg}},
	)

	# Store every bucket as a {'name', 'count'} entry in the shared list.
	stations.extend(
	    {'name': bucket['key'], 'count': bucket['doc_count']}
	    for bucket in results['aggregations']['unique_station_names']['buckets']
	)

	# For every station: fetch per-day entry/exit totals, rank the days, and
	# record how far the busiest day sits above that station's daily mean.
	for station in stations:
	    # Only the aggregation is consumed below, and an aggregation is computed
	    # over every matching document regardless of how many hits come back.
	    # One request with size=0 therefore replaces the previous loop, which
	    # re-sent this identical query ceil(count / CHUNK_SIZE) times (no offset,
	    # no scroll) and kept only the last response.
	    result = client.search(
	        index=INDEX_NAME,
	        doc_type='seoul-metro',
	        body={
	            'query': {
	                'constant_score': {
	                    'filter': {
	                        'term': {
	                            'station_name': station['name'],
	                        },
	                    },
	                },
	            },
	            'aggs': {
	                'amount_per_day': {
	                    'date_histogram': {
	                        'field': 'time_slot',
	                        'interval': 'day',
	                    },
	                    'aggs': {
	                        'total_in': {
	                            'sum': {
	                                'field': 'people_in',
	                            },
	                        },
	                        'total_out': {
	                            'sum': {
	                                'field': 'people_out',
	                            },
	                        },
	                    },
	                },
	            },
	        },
	        size=0,
	    )

	    buckets = result['aggregations']['amount_per_day']['buckets']
	    if not buckets:
	        # Without any daily bucket the [0] lookups below would raise
	        # IndexError, so report the station and move on.
	        print('No daily buckets for station %s; skipping' % station['name'])
	        continue

	    # One {'date', 'total'} record per day, for entries and for exits.
	    people_ins = [
	        {'date': bucket['key_as_string'], 'total': bucket['total_in']['value']}
	        for bucket in buckets
	    ]
	    people_outs = [
	        {'date': bucket['key_as_string'], 'total': bucket['total_out']['value']}
	        for bucket in buckets
	    ]

	    # Busiest day first.
	    people_ins.sort(reverse=True, key=lambda d: d['total'])
	    people_outs.sort(reverse=True, key=lambda d: d['total'])

	    in_out_of_a_station = {
	        'station': station['name'],
	        'highest_people_ins': people_ins[:THRESHOLD],
	        'highest_people_outs': people_outs[:THRESHOLD],
	        # Walk backwards from the quietest day. The stop index has to be
	        # -THRESHOLD - 1; a stop of -THRESHOLD returns only THRESHOLD - 1
	        # entries.
	        'lowest_people_ins': people_ins[:-THRESHOLD - 1:-1],
	        'lowest_people_outs': people_outs[:-THRESHOLD - 1:-1],
	    }

	    # Gap between the single busiest day and the mean over all days.
	    peak_in = people_ins[0]
	    peak_out = people_outs[0]
	    mean_in = np.mean([day['total'] for day in people_ins])
	    mean_out = np.mean([day['total'] for day in people_outs])
	    highest_gap = {
	        'station': station['name'],
	        'date_for_ins': peak_in['date'],
	        'date_for_outs': peak_out['date'],
	        'highest_people_ins_gap': peak_in['total'] - mean_in,
	        'highest_people_outs_gap': peak_out['total'] - mean_out,
	    }

	    pprint(in_out_of_a_station)
	    pprint(highest_gap)
	    in_out_of_stations.append(in_out_of_a_station)
	    gap_rank_list.append(highest_gap)

	# Order the stations by how far their busiest entry day exceeds their own
	# daily mean (largest gap first), then show the first twenty. The slice
	# assignment keeps the reordering in place on the shared list.
	gap_rank_list[:] = sorted(
	    gap_rank_list,
	    key=lambda entry: entry['highest_people_ins_gap'],
	    reverse=True,
	)
	pprint(gap_rank_list[:20])