A simple and minimalist CLI browser
# Libraries
import requests
from bs4 import BeautifulSoup


class Browser:
    # Make HTTP GET request
    def fetch(self, url):
        # Try to fetch URL
        try:
            # Obtain HTML response
            response = requests.get(url)

            # Return response object
            return response

        # Return None if the request failed
        except requests.exceptions.RequestException:
            return None

    # Parse content and pretty print it to screen
    def parse(self, html):
        # Parse the content
        content = BeautifulSoup(html, 'lxml')

        # Loop over content tags recursively
        for tag in content.recursiveChildGenerator():
            # If a leaf (text) node is found
            if tag.name is None:
                # And it's not empty
                if not tag.isspace():
                    # Extract parent tag name
                    parent = tag.parent.name

                    # Ignore 'script' and 'style' tags
                    if parent != 'script' and parent != 'style' and tag != 'html':
                        # Calculate indentation from the tag's nesting depth
                        indent = ' ' * len(list(tag.parents))

                        # Distinguish links from other tags
                        if parent == 'a':
                            print(indent + '[' + tag.strip() + '] ' + '<' + tag.parent['href'] + '>')
                        else:
                            print(indent + tag.strip())

    # Download the file
    def download(self, filename, response):
        # Open file stream to write raw bytes
        with open(filename, 'wb') as download_file:
            # Loop over response content
            for chunk in response.iter_content(chunk_size=128):
                # Write a chunk of data
                download_file.write(chunk)

    # Run browser loop
    def run(self):
        while True:
            # Init user input string
            user_input = input('browse > ')

            # Parse user input string
            try:
                # Extract user command
                command = user_input.split()[0]

                # Extract user URL
                url = user_input.split()[1]

                # Case "fetch" command
                if command == 'fetch':
                    print('Fetching URL: %s' % url)

                    # Fetch user URL
                    response = self.fetch(url)

                    # Parse URL
                    try:
                        self.parse(response.text)
                    except Exception:
                        print('Failed to fetch URL: %s' % url)
                        continue

                # Case "download" command
                elif command == 'download':
                    try:
                        # Init user filename
                        filename = user_input.split()[2]

                        # Fetch user URL
                        response = self.fetch(url)

                        # Download data from user URL
                        try:
                            print('Downloading data from URL: %s | PATH: "%s"' % (url, filename))

                            # Download file
                            self.download(filename, response)
                            print('Done')
                        except Exception:
                            print('Failed to download data from URL: %s' % url)
                            continue

                    except IndexError:
                        print('Download URL example: download https://github.com/maksimKorzh/one-time-scrapers/archive/master.zip ots.zip')
                        continue

            except IndexError:
                if user_input == 'exit':
                    break
                else:
                    print('Usage:')
                    print('Fetch URL example: fetch https://www.google.com/')
                    print('Download URL example: download https://github.com/maksimKorzh/one-time-scrapers/archive/master.zip ots.zip')


if __name__ == '__main__':
    browser = Browser()
    browser.run()
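A minimal usage sketch: the Browser class can also be driven directly instead of through the interactive loop. This assumes the gist is saved as cli_browser.py (the module name is only an example) and that requests, beautifulsoup4, and lxml are installed; the URLs are the same ones used in the script's own usage hints.

# Usage sketch: drive the Browser class directly
# (assumes this file is saved as cli_browser.py; module name is an example)
from cli_browser import Browser

browser = Browser()

# Fetch a page and pretty-print its text content, with links shown as [text] <href>
response = browser.fetch('https://www.google.com/')
if response is not None:
    browser.parse(response.text)

# Download a file to disk in 128-byte chunks
archive = browser.fetch('https://github.com/maksimKorzh/one-time-scrapers/archive/master.zip')
if archive is not None:
    browser.download('ots.zip', archive)

Running the file itself starts the interactive loop, where fetch <url>, download <url> <filename>, and exit are the recognized commands.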