
@punchagan
Last active August 21, 2024 08:02

Revisions

  1. punchagan revised this gist Dec 14, 2013. 1 changed file with 17 additions and 8 deletions.
    scrape_google_groups.py (25 changes: 17 additions & 8 deletions)
    @@ -20,6 +20,7 @@ def __init__(self, url, verbose=False, persistence_file='group.json'):
             self.thread_urls = []
             self.raw_urls = []
             self._current_thread_index = -1
    +        self._current_message_index = -1
             self._get_state()

         #### GoogleGroupsScraper interface ########################################

    @@ -59,13 +60,16 @@ def get_all_raw_urls(self):
         def save_all_posts(self):
             """ Save all the posts to a persist directory. """

    -        raw_urls = self.get_all_raw_urls()
    +        self.get_all_raw_urls()

    -        messages = [
    -            self._save_content_of_messages(url) for url in raw_urls
    -        ]
    -
    -        return messages
    +        for i, url in enumerate(self.raw_urls):
    +            if i <= self._current_message_index:
    +                continue
    +            if self.verbose:
    +                print 'Saving message %d of %d' % (i, len(self.raw_urls))
    +            self._save_content_of_messages(url)
    +            self._current_message_index = i
    +            self._set_state()

         #### Private interface ####################################################

    @@ -150,11 +154,16 @@ def _get_state(self):
                 'current_thread_index', -1
             )

    +        self._current_message_index = data.get(
    +            'current_message_index', -1
    +        )
    +
         def _set_state(self):
             """ Save the state to the persistence file. """

    +        # fixme: persist everything to separate files!
             data = {
                 'current_thread_index': self._current_thread_index,
    +            'current_message_index': self._current_message_index,
                 'thread_urls': self.thread_urls,
                 'raw_urls': self.raw_urls,
             }

    @@ -222,7 +231,7 @@ def _save_content_of_messages(self, url):
                 makedirs(dir_)

             with open(file_path, 'w') as f:
    -            f.write(message)
    +            f.write(message.encode('utf8'))

             return message
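
    With this revision every saved message is checkpointed, so an interrupted
    run can be resumed by simply re-running the script. A quick way to see
    where a run stopped is to read the state file back (a minimal sketch,
    assuming a group.json written by _set_state above):

        import json

        with open('group.json') as f:
            state = json.load(f)

        # Indices are -1 until the first thread / message has been processed.
        print 'Stopped at thread %d of %d, message %d of %d' % (
            state.get('current_thread_index', -1), len(state.get('thread_urls', [])),
            state.get('current_message_index', -1), len(state.get('raw_urls', [])),
        )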

  2. punchagan revised this gist Dec 13, 2013. 1 changed file with 36 additions and 14 deletions.
    scrape_google_groups.py (50 changes: 36 additions & 14 deletions)
    @@ -1,3 +1,4 @@
    +import json
     from os.path import exists
     from selenium import webdriver
     from selenium.webdriver.support.ui import WebDriverWait

    @@ -9,13 +10,17 @@ class GoogleGroupsScraper(object):

         #### object interface #####################################################

    -    def __init__(self, url, verbose=False):
    +    def __init__(self, url, verbose=False, persistence_file='group.json'):
             self.url = url
             self.driver = self._get_driver()
             self.wait = WebDriverWait(self.driver, 30)
             self.verbose = verbose

    -        self.thread_urls = self._get_persisted_urls()
    +        self.persistence_file = persistence_file
    +        self.thread_urls = []
    +        self.raw_urls = []
    +        self._current_thread_index = -1
    +        self._get_state()

         #### GoogleGroupsScraper interface ########################################

    @@ -33,19 +38,23 @@ def get_all_thread_urls(self):
             if self.verbose:
                 print 'Found %d threads.' % len(self.thread_urls)

    +        self._set_state()
    +
         def get_all_raw_urls(self):
             """ Return all the raw urls in the forum. """

             self.get_all_thread_urls()

    -        raw_urls = []
    -
             for i, url in enumerate(self.thread_urls):
    +            if i <= self._current_thread_index:
    +                continue
                 if self.verbose:
                     print 'Fetching raw urls in thread: %d' % i
    -            raw_urls.extend(self._get_all_raw_urls_in_thread(url))
    +            self.raw_urls.extend(self._get_all_raw_urls_in_thread(url))
    +            self._current_thread_index = i
    +            self._set_state()

    -        return raw_urls
    +        return self.raw_urls

         def save_all_posts(self):
             """ Save all the posts to a persist directory. """

    @@ -128,19 +137,32 @@ def _get_last_post(self):

             return last_post

    -    def _get_persisted_urls(self):
    +    def _get_state(self):
             """ Return the persisted urls of a post, from a previous run. """

    -        persisted_file = 'urls.txt'
    +        if exists(self.persistence_file):
    +            with open(self.persistence_file) as f:
    +                data = json.load(f)
    +            for attr in ['raw_urls', 'thread_urls']:
    +                setattr(self, attr, data.get(attr, []))

    -        if exists(persisted_file):
    -            with open(persisted_file) as f:
    -                urls = f.read().splitlines()
    +            self._current_thread_index = data.get(
    +                'current_thread_index', -1
    +            )

    -        else:
    -            urls = []
    +    def _set_state(self):
    +        """ Save the state to the persistence file. """

    -        return urls
    +        data = {
    +            'current_thread_index': self._current_thread_index,
    +            'thread_urls': self.thread_urls,
    +            'raw_urls': self.raw_urls,
    +        }
    +
    +        with open(self.persistence_file, 'w') as f:
    +            if self.verbose:
    +                print 'Saving state ...'
    +            json.dump(data, f, indent=2)

         def _get_post_list(self):
             """ Get the list of posts currently visible in a groups page. """
  3. punchagan created this gist Dec 13, 2013.
    scrape_google_groups.py (237 changes: 237 additions & 0 deletions)
    @@ -0,0 +1,237 @@
    from os.path import exists
    from selenium import webdriver
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.common.exceptions import TimeoutException


    class GoogleGroupsScraper(object):
        """ A simple class to scrape a google group. """

        #### object interface #####################################################

        def __init__(self, url, verbose=False):
            self.url = url
            self.driver = self._get_driver()
            self.wait = WebDriverWait(self.driver, 30)
            self.verbose = verbose

            self.thread_urls = self._get_persisted_urls()

        #### GoogleGroupsScraper interface ########################################

        def get_all_thread_urls(self):
            """ Return and persist the urls for all the threads. """

            if len(self.thread_urls) == 0:
                self.driver.get(self.url)
                post_list = self._scroll_to_get_all_posts()
                self.thread_urls = self._get_urls_from_post_list(post_list)

            else:
                print 'Using persisted urls ...'

            if self.verbose:
                print 'Found %d threads.' % len(self.thread_urls)

        def get_all_raw_urls(self):
            """ Return all the raw urls in the forum. """

            self.get_all_thread_urls()

            raw_urls = []

            for i, url in enumerate(self.thread_urls):
                if self.verbose:
                    print 'Fetching raw urls in thread: %d' % i
                raw_urls.extend(self._get_all_raw_urls_in_thread(url))

            return raw_urls

        def save_all_posts(self):
            """ Save all the posts to a persist directory. """

            raw_urls = self.get_all_raw_urls()

            messages = [
                self._save_content_of_messages(url) for url in raw_urls
            ]

            return messages

        #### Private interface ####################################################

        def _get_driver(self):
            """ Get the web-driver for the scraper. """

            driver = webdriver.Firefox()
            driver.implicitly_wait(30)

            return driver

        def _get_all_raw_urls_in_thread(self, thread_url):
            """ Return the raw urls of all the messages in the given thread. """

            self.driver.get(thread_url)

            # fixme: see if javascript finished loading...
            try:
                WebDriverWait(self.driver, 3).until(lambda d: False)
            except TimeoutException:
                pass

            message_ids = self._get_all_message_ids()

            raw_urls = [
                self._get_raw_url(thread_url, message_id)
                for message_id in message_ids
            ]

            if self.verbose:
                print 'Obtained %s raw urls.' % len(raw_urls)

            return raw_urls

        def _get_all_message_buttons(self):
            """ Return all the message buttons on the page. """

            timeline = self.driver.find_element_by_id('tm-tl')
            all_buttons = timeline.find_elements_by_class_name(
                'jfk-button-standard'
            )

            return all_buttons

        def _get_all_message_ids(self):
            """ Return all the message ids given a timeline with list of messages.
            """

            all_buttons = self._get_all_message_buttons()
            message_buttons = [
                el for el in all_buttons
                if el.get_attribute('aria-label').startswith('More')
            ]
            message_ids = [
                button.get_attribute('id')[len('b_action_'):]
                for button in message_buttons
            ]

            return message_ids

        def _get_last_post(self):
            """ Get the currently displayed last post. """

            post_list = self._get_post_list()
            last_post = post_list.find_elements_by_class_name('GIURNSTDIQ')[-1]
            # Hack to scroll to the last post
            last_post.location_once_scrolled_into_view
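            # Merely accessing location_once_scrolled_into_view makes the
            # driver scroll the element into view; the value itself is unused.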

            return last_post

        def _get_persisted_urls(self):
            """ Return the persisted urls of a post, from a previous run. """

            persisted_file = 'urls.txt'

            if exists(persisted_file):
                with open(persisted_file) as f:
                    urls = f.read().splitlines()

            else:
                urls = []

            return urls

        def _get_post_list(self):
            """ Get the list of posts currently visible in a groups page. """

            return self.driver.find_element_by_class_name('GIURNSTDGBC')

        def _get_raw_url(self, thread_url, message_id):
            """ Return the raw url given the thread_url and the message_id. """

            _, group, thread_id = thread_url.rsplit('/', 2)
            url_fmt = 'https://groups.google.com/forum/message/raw?msg=%s/%s/%s'
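            # For illustration (hypothetical ids): a thread url like
            #   https://groups.google.com/forum/#!topic/mumbai-ultimate/abc123
            # splits into group 'mumbai-ultimate' and thread_id 'abc123', so the
            # raw url for message 'XYZ' would be
            #   https://groups.google.com/forum/message/raw?msg=mumbai-ultimate/abc123/XYZ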

            return url_fmt % (group, thread_id, message_id)

        def _get_urls_from_post_list(self, post_list):
            """ Given a post_list element, return the urls of all the posts. """

            print 'Fetching post urls from all the displayed posts ...'
            urls = [
                el.get_attribute('href')
                for el in post_list.find_elements_by_tag_name('a')
            ]

            urls = [
                url for url in urls
                if url and url.startswith('https://groups.google.com/forum/')
            ]

            with open('urls.txt', 'w') as f:
                f.write('\n'.join(urls))

            return urls

        def _save_content_of_messages(self, url):
            """ Save the content in the raw url provided.

            Persists the message to forum_name/thread_id/message_id. Return the
            content of the message for convenience.
            """

            import requests
            from urlparse import urlsplit
            from os import makedirs
            from os.path import dirname, sep

            message = requests.get(url).text

            query = urlsplit(url).query
            query = dict([params.split('=') for params in query.split('&')])
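            # The raw urls built by _get_raw_url carry a single 'msg' parameter,
            # so this hand-rolled parse suffices; urlparse.parse_qs would also work.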
            path = query['msg']

            file_path = path.replace('/', sep)
            dir_ = dirname(file_path)

            if not exists(dir_):
                makedirs(dir_)

            with open(file_path, 'w') as f:
                f.write(message)

            return message

        def _scroll_to_get_all_posts(self):
            """ Scroll the page until all the posts get displayed.

            Caution: Quite hackish!
            """

            print 'Scrolling until all the posts are visible ...'

            while True:
                if self.verbose:
                    print 'scrolling...'

                last_post = self._get_last_post()

                def new_post_fetched(d):
                    new_post = self._get_last_post()
                    return last_post.text != new_post.text

                try:
                    self.wait.until(lambda d: new_post_fetched(d))
                except TimeoutException:
                    print 'Found all posts.'
                    break

            return self._get_post_list()


    if __name__ == "__main__":
        forum_url = 'https://groups.google.com/forum/#!forum/mumbai-ultimate'
        scraper = GoogleGroupsScraper(forum_url, verbose=True)
        scraper.save_all_posts()
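
    For reference, _save_content_of_messages derives each output path from the
    msg query parameter, so a run against the example forum writes one file per
    message under the working directory, e.g. mumbai-ultimate/<thread_id>/<message_id>
    (placeholders, not real ids).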