Created
July 18, 2021 05:16
-
-
Save MaLiN2223/c90017290d659340b3682d107fde77f0 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import bz2
import lzma
from typing import Optional, Set

from src.reddit_input_processing.zreader import Zreader
def decode_bz2_posts(file_path: str, base_path: str, subreddits: Set[str]) -> None:
    """Open a bzip2-compressed submissions archive and hand it to ``read_posts``.

    Args:
        file_path: Archive name, joined onto ``base_path`` with a ``/``.
        base_path: Directory that contains the archive.
        subreddits: Set of subreddits, forwarded unchanged to ``read_posts``.
    """
    archive_path = f"{base_path}/{file_path}"
    # Binary mode: the decompressed stream is passed on as bytes and any
    # decoding is left to read_posts.
    with bz2.open(archive_path, "rb") as stream:
        read_posts(stream, subreddits)
def decode_xz_posts(file_path: str, base_path: str, subreddits: Set[str]) -> None:
    """Open an xz-compressed submissions archive and hand it to ``read_posts``.

    Args:
        file_path: Archive name, joined onto ``base_path`` with a ``/``.
        base_path: Directory that contains the archive.
        subreddits: Set of subreddits, forwarded unchanged to ``read_posts``.
    """
    archive_path = f"{base_path}/{file_path}"
    # Both handles are released when the block exits: the LZMA wrapper first,
    # then the underlying raw file.
    with open(archive_path, "rb") as raw, lzma.LZMAFile(raw) as stream:
        read_posts(stream, subreddits)
def decode_zst_posts(file_path: str, base_path: str, subreddits: Set[str]) -> None:
    """Read a zstd-compressed submissions archive through ``Zreader``.

    Args:
        file_path: Archive name, joined onto ``base_path`` with a ``/``.
        base_path: Directory that contains the archive.
        subreddits: Set of subreddits, forwarded unchanged to ``read_posts``.
    """
    archive_path = f"{base_path}/{file_path}"
    # NOTE(review): Zreader.readlines() presumably yields decoded lines
    # lazily - confirm against the Zreader implementation.
    lines = Zreader(archive_path, chunk_size=8192).readlines()
    read_posts(lines, subreddits)
def decode_bz2_comments(file_path: str, base_path: str, subreddits: Set[str]) -> None:
    """Open a bzip2-compressed comments archive and hand it to ``read_comments``.

    Args:
        file_path: Archive name, joined onto ``base_path`` with a ``/``.
        base_path: Directory that contains the archive.
        subreddits: Set of subreddits, forwarded unchanged to ``read_comments``.
    """
    archive_path = f"{base_path}/{file_path}"
    # Binary mode: the decompressed stream is passed on as bytes and any
    # decoding is left to read_comments.
    with bz2.open(archive_path, "rb") as stream:
        read_comments(stream, subreddits)
def decode_xz_comments(file_path: str, base_path: str, subreddits: Set[str]) -> None:
    """Open an xz-compressed comments archive and hand it to ``read_comments``.

    Args:
        file_path: Archive name, joined onto ``base_path`` with a ``/``.
        base_path: Directory that contains the archive.
        subreddits: Set of subreddits, forwarded unchanged to ``read_comments``.
    """
    archive_path = f"{base_path}/{file_path}"
    # Both handles are released when the block exits: the LZMA wrapper first,
    # then the underlying raw file.
    with open(archive_path, "rb") as raw, lzma.LZMAFile(raw) as stream:
        read_comments(stream, subreddits)
def decode_zst_comments(file_path: str, base_path: str, subreddits: Set[str]) -> None:
    """Read a zstd-compressed comments archive through ``Zreader``.

    Args:
        file_path: Archive name, joined onto ``base_path`` with a ``/``.
        base_path: Directory that contains the archive.
        subreddits: Set of subreddits, forwarded unchanged to ``read_comments``.
    """
    archive_path = f"{base_path}/{file_path}"
    # NOTE(review): Zreader.readlines() presumably yields decoded lines
    # lazily - confirm against the Zreader implementation.
    lines = Zreader(archive_path, chunk_size=8192).readlines()
    read_comments(lines, subreddits)
def load(
    file_name: str,
    base_path: str = "data_in",
    subreddits: Optional[Set[str]] = None,
):
    """Route a dump archive to the decoder matching its kind and compression.

    The kind is taken from the file name: a name containing ``"RS_"`` is
    treated as submissions, otherwise one containing ``"RC_"`` is treated as
    comments. The compression is then chosen by the first of ``".bz2"``,
    ``".xz"`` or ``".zst"`` that occurs in the name. All checks are substring
    matches, not prefix/suffix matches. A name that matches no kind or no
    compression is reported on stdout and otherwise ignored.

    Args:
        file_name: Archive name, relative to ``base_path``.
        base_path: Directory that contains the archive.
        subreddits: Set of subreddits forwarded to the decoder. Must be
            non-empty; the default of ``None`` is rejected like an empty set.

    Raises:
        ValueError: If ``subreddits`` is ``None`` or empty.
    """
    # None replaces the former mutable ``set()`` default; both mean "nothing
    # to filter on" and are rejected with the same error.
    if not subreddits:
        raise ValueError("Subreddits are empty")

    if "RS_" in file_name:
        print("Submissions", file_name)
        is_posts = True
    elif "RC_" in file_name:
        print("Comments", file_name)
        is_posts = False
    else:
        print("Unrecognized file name", file_name)
        return

    # Compression markers are tested in the same order for both kinds.
    if ".bz2" in file_name:
        decoder = decode_bz2_posts if is_posts else decode_bz2_comments
    elif ".xz" in file_name:
        decoder = decode_xz_posts if is_posts else decode_xz_comments
    elif ".zst" in file_name:
        decoder = decode_zst_posts if is_posts else decode_zst_comments
    else:
        print("Unrecognized file name", file_name)
        return

    decoder(file_name, base_path, subreddits)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment