#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import (division, print_function, absolute_import,
                        unicode_literals)

__all__ = ["fetch"]

import gevent
from gevent import monkey
monkey.patch_all()
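# NOTE: monkey.patch_all() should run before the imports below (especially
# requests) so that their blocking socket I/O becomes cooperative greenlet
# I/O.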
import re
import os
import glob
import gzip
import json
import time
import shutil
import logging
import requests
from datetime import date, timedelta
from tempfile import NamedTemporaryFile

from osrc.index import rebuild_index
from osrc.database import get_pipeline
from osrc.database import format_key as _format
# Make sure that the directory for the local caching of the data exists.
local_data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                              "data")
local_data_dir = os.environ.get("OSRC_DATA_DIR", local_data_dir)
fn_template = os.path.join(local_data_dir,
                           "{year}-{month:02d}-{day:02d}-{n}.json.gz")
try:
    os.makedirs(local_data_dir)
except os.error:
    pass
# The URL template for the GitHub Archive.
archive_url = ("http://data.githubarchive.org/"
               "{year}-{month:02d}-{day:02d}-{n}.json.gz")
# Regular expression for parsing the archive filename format.
date_re = re.compile(r"([0-9]{4})-([0-9]{2})-([0-9]{2})-([0-9]+)\.json\.gz")
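# For the example file above:
#     date_re.findall("2014-03-01-12.json.gz") == [("2014", "03", "01", "12")]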
def _fetch_one(year, month, day, n):
    kwargs = {"year": year, "month": month, "day": day, "n": n}
    local_fn = fn_template.format(**kwargs)

    # Skip if the file already exists.
    if os.path.exists(local_fn):
        return

    # Download the remote file.
    remote = archive_url.format(**kwargs)
    r = requests.get(remote)
    if r.status_code == requests.codes.ok:
        # Atomically write to disk. The temporary file is created in the
        # target directory so that the final rename never crosses a
        # filesystem boundary.
        # http://stackoverflow.com/questions/2333872/
        #   atomic-writing-to-file-with-python
        f = NamedTemporaryFile("wb", dir=local_data_dir, delete=False)
        f.write(r.content)
        f.flush()
        os.fsync(f.fileno())
        f.close()
        shutil.move(f.name, local_fn)
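# The GitHub Archive publishes one gzipped JSON file per hour (n = 0..23),
# so one day of activity is spread across 24 archives.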
def fetch(year, month, day):
    """
    Asynchronously download all the event archives for one day of activity
    from the GitHub Archive.

    :param year: The 4-digit year of the date.
    :param month: The integer id of the target month ``[1, 12]``.
    :param day: The integer id of the target day ``[1, 31]``.

    """
    jobs = [gevent.spawn(_fetch_one, year, month, day, n) for n in range(24)]
    gevent.joinall(jobs)
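# All of the Redis updates for one archive are queued on a single pipeline
# and flushed with one execute() call, avoiding a network round trip per
# command.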
def process(filename):
    """
    Process a single gzipped archive file and push the results to the
    database.

    :param filename: The absolute path to the archive file.

    """
    # Figure out the day of the week from the filename (this is probably not
    # always right but it'll do).
    year, month, day, hour = map(int, date_re.findall(filename)[0])
    weekday = date(year=year, month=month, day=day).strftime("%w")

    # Set up a redis pipeline.
    pipe = get_pipeline()

    # Unzip and load the file.
    strt = time.time()
    count = 0
    with gzip.GzipFile(filename) as f:
        # One event per line.
        events = [line.decode("utf-8", errors="ignore") for line in f]
        count = len(events)
        for n, line in enumerate(events):
            # Parse the JSON of this event.
            try:
                event = json.loads(line)
            except ValueError:
                logging.warning("Failed on line {0} of {1}-{2:02d}-{3:02d}-{4}"
                                .format(n, year, month, day, hour))
                continue

            # Get the user involved and skip if there isn't one.
            actor = event["actor"]
            attrs = event.get("actor_attributes", {})
            if actor is None or attrs.get("type") != "User":
                # This was probably an anonymous event (like a gist event)
                # or an organization event.
                continue

            # Normalize the user name.
            key = actor.lower()

            # Get the type of event.
            evttype = event["type"]
            nevents = 1

            # Can this be called a "contribution"?
            contribution = evttype in ["IssuesEvent", "PullRequestEvent",
                                       "PushEvent"]

            # Increment the global sum histograms.
            pipe.incr(_format("total"), nevents)
            pipe.hincrby(_format("day"), weekday, nevents)
            pipe.hincrby(_format("hour"), hour, nevents)
            pipe.zincrby(_format("user"), key, nevents)
            pipe.zincrby(_format("event"), evttype, nevents)

            # Event histograms.
            pipe.hincrby(_format("event:{0}:day".format(evttype)), weekday,
                         nevents)
            pipe.hincrby(_format("event:{0}:hour".format(evttype)), hour,
                         nevents)

            # User schedule histograms.
            pipe.hincrby(_format("user:{0}:day".format(key)), weekday,
                         nevents)
            pipe.hincrby(_format("user:{0}:hour".format(key)), hour, nevents)

            # User event type histograms.
            pipe.zincrby(_format("user:{0}:event".format(key)), evttype,
                         nevents)
            pipe.hincrby(_format("user:{0}:event:{1}:day".format(key,
                                                                 evttype)),
                         weekday, nevents)
            pipe.hincrby(_format("user:{0}:event:{1}:hour".format(key,
                                                                  evttype)),
                         hour, nevents)

            # Parse the name and owner of the affected repository.
            repo = event.get("repository", {})
            owner, name, org = (repo.get("owner"), repo.get("name"),
                                repo.get("organization"))
            if owner and name:
                repo_name = "{0}/{1}".format(owner, name)
                pipe.zincrby(_format("repo"), repo_name, nevents)

                # Save the social graph.
                pipe.zincrby(_format("social:user:{0}".format(key)),
                             repo_name, nevents)
                pipe.zincrby(_format("social:repo:{0}".format(repo_name)),
                             key, nevents)

                # Do we know what the language of the repository is?
                language = repo.get("language")
                if language:
                    # Which are the most popular languages?
                    pipe.zincrby(_format("lang"), language, nevents)

                    # Total number of pushes.
                    if evttype == "PushEvent":
                        pipe.zincrby(_format("pushes:lang"), language,
                                     nevents)

                    pipe.zincrby(_format("user:{0}:lang".format(key)),
                                 language, nevents)

                    # Who are the most important users of a language?
                    if contribution:
                        pipe.zincrby(_format("lang:{0}:user".format(language)),
                                     key, nevents)

    pipe.execute()
    logging.info("Processed {0} events in {1} [{2:.2f} seconds]"
                 .format(count, filename, time.time() - strt))
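# Glue function: download one day's archives and then process whichever of
# the 24 hourly files actually made it to disk.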
def fetch_and_process(year, month, day):
    logging.info("Processing data for {0:04d}-{1:02d}-{2:02d}"
                 .format(year, month, day))
    fetch(year, month, day)

    # Find the archive files that were actually downloaded.
    kwargs = {"year": year, "month": month, "day": day, "n": "*"}
    filenames = glob.glob(fn_template.format(**kwargs))
    if len(filenames) != 24:
        logging.warning("Missing {0} archive files for date "
                        "{1:04d}-{2:02d}-{3:02d}"
                        .format(24 - len(filenames), year, month, day))

    # NOTE: a bare ``map(process, filenames)`` is lazy on Python 3 and would
    # never run, so loop explicitly.
    for fn in sorted(filenames):
        process(fn)
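# Command-line entry point. An illustrative invocation (the log path is
# just an example):
#     ./osrcd --since 2014-01-01 --log osrc.log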
if __name__ == "__main__":
    import argparse
    from osrc import create_app

    today = date.today()

    # Parse the command line arguments.
    parser = argparse.ArgumentParser(description="Monitor GitHub activity.")
    parser.add_argument("--since", default=None,
                        help="The starting date in YYYY-MM-DD format.")
    parser.add_argument("--config", default=None,
                        help="The path to the local configuration file.")
    parser.add_argument("--log", default=None,
                        help="The path to the log file.")
    args = parser.parse_args()

    # Log to a file if one was requested, otherwise to the console.
    largs = dict(level=logging.INFO,
                 format="[%(asctime)s] %(name)s:%(levelname)s:%(message)s")
    if args.log is not None:
        largs["filename"] = args.log
    logging.basicConfig(**largs)

    # Initialize a flask app.
    app = create_app(args.config)

    # Set up the app in a request context.
    with app.test_request_context():
        if args.since is not None:
            # Process every day from --since up to (but not including)
            # today.
            day = date(**dict(zip(["year", "month", "day"],
                                  map(int, args.since.split("-")))))
            while day < today:
                fetch_and_process(day.year, day.month, day.day)
                day += timedelta(1)
        else:
            # By default, process yesterday's (complete) archive.
            yesterday = today - timedelta(1)
            fetch_and_process(yesterday.year, yesterday.month,
                              yesterday.day)

        logging.info("Rebuilding index.")
        rebuild_index()

    logging.info("Finished.")