|
""" |
|
table: |
|
book_id, tag_id, weight, boom |
|
|
|
insert/update: |
|
boom = boomfilter(book_tag_ids) |
|
|
|
select: |
|
q = boomfilter(query_tag_ids) |
|
select * from table |
|
where (tag_id in query_tag_ids) and (boom & q = q) |
|
""" |
|
|
|
|
|
def fnv(n, p): |
|
# http://isthe.com/chongo/tech/comp/fnv/ |
|
p = (p * 16777619) & 0xffffffff |
|
return (n ^ p) * p |
|
|
|
|
|
def boomfilter(numbers): |
|
# https://llimllib.github.io/bloomfilter-tutorial/zh_CN/ |
|
m = 64 |
|
# k个随机质数, k=(m/n)ln(2), n=平均numbers个数 |
|
p_s = [389749, 834913, 271129, 691693, 964009, 963481] |
|
boom = 0 |
|
for n in numbers: |
|
for p in p_s: |
|
i = fnv(n, p) % m |
|
boom = boom | (1 << i) |
|
return boom |
|
|
|
|
|
def is_match(boom, numbers): |
|
# https://stackoverflow.com/questions/360844/mysql-bitwise-operations-bloom-filter |
|
q = boomfilter(numbers) |
|
return (boom & q) == q |
|
|
|
|
|
def main(): |
|
import itertools |
|
import random |
|
tags = [random.randint(0, 2**16 - 1) for _ in range(8)] |
|
not_tags = [random.randint(0, 2**16 - 1) for _ in range(8)] |
|
boom = boomfilter(tags) |
|
print('{:b}'.format(boom)) |
|
for t in tags: |
|
match = is_match(boom, [t]) |
|
print(f'tag {t} match={match}') |
|
for t_s in itertools.combinations(tags, 2): |
|
match = is_match(boom, t_s) |
|
print(f'tag {t_s} match={match}') |
|
for t in not_tags: |
|
match = is_match(boom, [t]) |
|
print(f'not tag {t} match={match}') |
|
for t_s in itertools.combinations(not_tags, 2): |
|
match = is_match(boom, t_s) |
|
print(f'not tag {t_s} match={match}') |
|
|
|
|
|
if __name__ == "__main__": |
|
main() |