Created
September 26, 2025 22:02
-
-
Save Sonictherocketman/606fcded0e511b92224c6deee86faa24 to your computer and use it in GitHub Desktop.
A script to parse Caddy files for RSS feed subscribers.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #! /usr/bin/env python3 | |
| import argparse | |
| from collections import Counter, defaultdict | |
| import json | |
| import re | |
| import sys | |
# Request URIs that count as a feed fetch; any other URI is ignored.
FEED_URIS = ('/rss.xml', '/feed.json')

# User-Agent fragments that identify an ordinary web browser / OS platform.
BROWSERS = (
    # Mobile
    'iPhone', 'iPad', 'Android',
    # Desktop
    'Linux', 'Windows NT', 'Macintosh',
)
# Self-hosted or desktop readers. Their User-Agent carries no subscriber
# count, so each unique IP address is treated as one subscriber.
# Order matters: entries are tried first to last, so keep the more general
# user agents (browser versions, etc.) towards the end.
INDIVIDUAL_SERVICES = (
    'NetNewsWire', 'Unread', 'Tiny Tiny RSS', 'kurtmckee/feedparser',
    'Googlebot', 'YandexBot', 'FreshRSS', 'Reeder',
    'Miniflux', 'UniversalFeedParser', 'curl', 'SimplePie',
    'FeedsFun', 'Twingly', 'RSS.Social', 'SpaceCowboys',
    'FreedomController', 'Newsflash', 'Yarr', 'Micro.blog',
    'fluent-reader', 'FeedDiscovery',
)
# Hosted readers that report their subscriber count in the User-Agent.
# Maps a service name (matched case-insensitively as a substring of the
# User-Agent) to a regex whose group 1 is the subscriber count and whose
# optional group 2 is a feed identifier.
#
# Patterns are raw strings so that regex escapes such as ``\.`` and ``\(``
# are not treated as (invalid) Python string escapes.
FRIENDLY_SERVICES = {
    'NewsBlur Feed Fetcher': r'NewsBlur Feed Fetcher - ([0-9]+) subscribers',
    'Inoreader': r'Inoreader/[0-9\.]+ \(\+http://www.inoreader.com/feed-fetcher; ([0-9]+) subscribers; \)',
    'Feedbin': r'Feedbin feed-id:[0-9]+ - ([0-9]+) subscribers',
    'Feedly': r'Feedly/[0-9\.]+ \(\+http://www.feedly.com/fetcher.html; ([0-9]+) subscribers; \)',
    # The '+' before the URL is a literal character and must be escaped;
    # unescaped it quantifies the preceding space and the pattern can never
    # match a User-Agent that contains "; +https://...".
    'BazQux': r'Mozilla/5.0 \(compatible; BazQux/[0-9\.]+; \+https://bazqux.com/fetcher; ([0-9]+) subscribers\)',
    'TheOldReader': r'Mozilla/[0-9\.]+ \(compatible; theoldreader.com; .* ([0-9]+) subscribers; feed-id=(.*)',
}
# User-Agent fragments of crawlers and generic HTTP clients. Requests whose
# User-Agent contains any of these are ignored rather than counted.
BOTS = {
    # Search engines, indexers and SEO crawlers
    'Newslitbot', 'Googlebot', 'athena-spider', 'rawweb-bot', 'YandexBot',
    'amazonbot', 'dataforseo-bot', 'kagibot', 'applebot',
    'DnBCrawler-Analytics', 'AhrefsBot', 'Xobaque', 'rss2tg_crawler',
    'ScourRSSBot',
    # Generic HTTP libraries and scraping tools
    'Go-http-client/', 'trafilatura', 'Buck/', 'Rome Client',
    'hackney/1.24.1', 'Apache-HttpClient',
    # Identified by site / host name
    'sumi.news', 'ifastnet.com', 'yakread.com',
}
def parse_args():
    """Parse the command-line arguments.

    Returns:
        argparse.Namespace with:
          * ``input`` -- an open text file to read log lines from; standard
            input when no path is given on the command line.
          * ``include_browsers`` -- True when ``--include-browsers`` was passed.
    """
    # The text is the description, not the program name: the first positional
    # parameter of ArgumentParser is ``prog``, so it must be passed by keyword.
    parser = argparse.ArgumentParser(
        description='Collect info about RSS feed subs',
    )
    parser.add_argument(
        'input',
        # A positional is mandatory unless nargs='?' is given; without it the
        # stdin default could never take effect.
        nargs='?',
        type=argparse.FileType('r'),
        default=sys.stdin,
        help='log file to read (default: standard input)',
    )
    parser.add_argument(
        '--include-browsers',
        action='store_true',
        default=False,
        help='also count requests made by ordinary web browsers',
    )
    return parser.parse_args()
def _first_match(candidates, ua_lower):
    """Return the first candidate contained in *ua_lower*, or None.

    Matching is a case-insensitive substring test; *ua_lower* must already be
    lowercased by the caller. Candidates are tried in iteration order.
    """
    return next(
        (name for name in candidates if name.lower() in ua_lower),
        None,
    )


def main():
    """Estimate feed subscribers from JSON access-log lines and print a report.

    Each input line is expected to be one JSON object carrying
    ``request.uri``, ``request.remote_ip`` and
    ``request.headers['User-Agent']`` (a list of strings). Only requests whose
    URI is in ``FEED_URIS`` are considered. Every request is classified by its
    User-Agent, in this order:

    1. Bots (``BOTS``) are ignored.
    2. Hosted readers (``FRIENDLY_SERVICES``) contribute the subscriber count
       embedded in their User-Agent, keyed per service/URI/feed id; the most
       recent value seen wins.
    3. Known individual readers (``INDIVIDUAL_SERVICES``) count one
       subscriber per unique service/URI/IP.
    4. Anything else counts one subscriber per unique User-Agent/URI/IP.
       Requests from ordinary browsers (``BROWSERS``) are counted only when
       ``--include-browsers`` is given.

    Blank lines and JSON records without a ``request`` object are skipped;
    lines that are not valid JSON are skipped and reported on stderr.
    The report itself is written to stdout.
    """
    args = parse_args()

    # (agent, uri, discriminator) -> subscriber count for that key.
    counts = {}
    malformed = 0

    for line in args.input:
        line = line.strip()
        if not line:
            continue
        try:
            data = json.loads(line)
        except json.JSONDecodeError:
            malformed += 1
            continue

        # Log files may contain records that are not HTTP access entries.
        request = data.get('request') if isinstance(data, dict) else None
        if not isinstance(request, dict):
            continue

        uri = request.get('uri')
        if uri not in FEED_URIS:
            continue

        ip = request.get('remote_ip', '')
        headers = request.get('headers') or {}
        # A missing header or an empty value list both fall back to 'none'.
        user_agent = (headers.get('User-Agent') or ['none'])[0]
        ua_lower = user_agent.lower()

        # Ignore bots. Matched case-insensitively, like every other list.
        if any(bot.lower() in ua_lower for bot in BOTS):
            continue

        # Try cloud services that report their own subscriber count.
        service = _first_match(FRIENDLY_SERVICES, ua_lower)
        if service is not None:
            match = re.search(FRIENDLY_SERVICES[service], user_agent)
            if match:
                # Group 2 (feed id) only exists for some patterns.
                feed_id = match.group(2) if match.re.groups >= 2 else ''
                counts[(service, uri, feed_id)] = int(match.group(1))
                continue
            # Name matched but the pattern did not: fall through and let the
            # remaining classifiers handle this request.

        # Try known self-hosted / desktop readers: one subscriber per IP.
        service = _first_match(INDIVIDUAL_SERVICES, ua_lower)
        if service is not None:
            counts[(service, uri, ip)] = 1
            continue

        # Everything else. Unknown non-browser agents are always counted;
        # browsers are counted in addition only when explicitly requested.
        browser = _first_match(BROWSERS, ua_lower)
        if browser is None:
            counts[(user_agent, uri, ip)] = 1
        elif args.include_browsers:
            counts[(browser, uri, ip)] = 1

    if malformed:
        print(f'Skipped {malformed} line(s) that were not valid JSON',
              file=sys.stderr)

    # Collapse the per-URI / per-IP keys into one total per agent.
    counter = Counter()
    for (agent, _uri, _discriminator), count in counts.items():
        counter[agent] += count

    print('Results:')
    print('-----------------------')
    print(f'Total Subscribers: {counter.total()}')
    print('-----------------------')
    print('Subs.\tUser-Agent')
    print('-----------------------')
    for item, count in counter.most_common():
        print(f'{count}\t{item}')
# Run the report only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment