import argparse
import logging
import os
import queue
import sys
from datetime import datetime

import tweepy
from sqlalchemy import (create_engine, Column, String, BigInteger, Integer,
                        Boolean, ForeignKey, DateTime)
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker, scoped_session, relationship

from manager.streamer import JSONStreamer

logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    datefmt='%m/%d/%Y %I:%M:%S %p',
    level=logging.INFO)
logger = logging.getLogger(__name__)

DEFAULT_GRAPH_FILENAME = 'crawled_network.gexf'
DEFAULT_RAW_RESULTS_FILENAME = 'crawled_network.json.gz'
DEFAULT_CRAWL_DEGREE = 1
DEFAULT_MAX_CONNECTIONS = 25000

Base = declarative_base()


class Node(Base):
    """A class representing a node in a graph.
    """
    __tablename__ = 'nodes'
    id = Column(BigInteger, primary_key=True)
    screen_name = Column(String(255))
    is_root = Column(Boolean)
    created_date = Column(DateTime, default=datetime.now)


class Edge(Base):
    """A class representing a directed edge in a graph.

    An edge from `source` to `target` means that `source` follows
    `target` on Twitter.
    """
    __tablename__ = 'edges'
    id = Column(Integer, primary_key=True)
    # Twitter account IDs exceed 32 bits, so the foreign keys use
    # BigInteger to match nodes.id
    source_id = Column(BigInteger, ForeignKey('nodes.id'))
    target_id = Column(BigInteger, ForeignKey('nodes.id'))
    source = relationship(
        Node, primaryjoin=source_id == Node.id, backref="friends")
    target = relationship(
        Node, primaryjoin=target_id == Node.id, backref="followers")
    created_date = Column(DateTime, default=datetime.now)
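
# A minimal sketch of how these models fit together (names and IDs are
# illustrative, not from a real crawl):
#
#     alice = Node(id=1, screen_name='alice', is_root=True)
#     bob = Node(id=2, screen_name='bob', is_root=False)
#     session.add_all([alice, bob])
#     session.add(Edge(source=alice, target=bob))  # alice follows bob
#     session.commit()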


def lookup_users(api, user_ids):
    """Hydrates a list of Twitter account IDs into full user objects.

    Twitter's users/lookup endpoint accepts at most 100 IDs per request,
    so the IDs are sent in batches of 100.

    Arguments:
        api {tweepy.API} -- The authenticated tweepy API object
        user_ids {list} -- The Twitter account IDs to look up
    """
    results = []
    lookup = []
    for user_id in user_ids:
        lookup.append(user_id)
        if len(lookup) == 100:
            try:
                response = api.lookup_users(
                    user_ids=lookup, include_entities=True)
                results.extend([result._json for result in response])
            except Exception as e:
                logger.error('Error looking up users: {}'.format(e))
            lookup = []
    # Look up any remaining IDs in a final, partial batch
    if len(lookup):
        try:
            response = api.lookup_users(user_ids=lookup, include_entities=True)
            results.extend([result._json for result in response])
        except Exception as e:
            logger.error('Error looking up users: {}'.format(e))
    return results
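
# Example usage (a sketch; the IDs are arbitrary placeholders):
#
#     users = lookup_users(api, [12345, 67890])
#     for user in users:
#         print(user['screen_name'])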


def get_friends(api, screen_name, max_connections=0):
    """Fetches the friends (accounts the user follows) for a screen name,
    stopping after at most `max_connections` accounts (0 means no limit).

    Arguments:
        api {tweepy.API} -- The authenticated tweepy API object
        screen_name {str} -- The screen name to fetch friends for
        max_connections {int} -- The maximum number of friends to fetch
    """
    friends = []
    max_connections_reached = False
    try:
        for friend_ids in tweepy.Cursor(
                api.friends_ids, screen_name=screen_name).pages():
            if max_connections and (
                    len(friends) + len(friend_ids)) > max_connections:
                logger.info(
                    'Max connections reached... trimming final request')
                friend_ids = friend_ids[:max_connections - len(friends)]
                max_connections_reached = True
            friends.extend(lookup_users(api, friend_ids))
            if max_connections_reached:
                break
    except Exception as e:
        logger.error('Error fetching friends: {}'.format(e))
    return friends
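
# Note: friends/ids returns IDs in pages of up to 5,000 in API v1.1, so
# max_connections is only checked once per page and the final page is
# trimmed to fit.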


def get_followers(api, screen_name, max_connections=0):
    """Fetches the followers for a screen name, stopping after at most
    `max_connections` accounts (0 means no limit).

    Arguments:
        api {tweepy.API} -- The authenticated tweepy API object
        screen_name {str} -- The screen name to fetch followers for
        max_connections {int} -- The maximum number of followers to fetch
    """
    followers = []
    max_connections_reached = False
    try:
        for follower_ids in tweepy.Cursor(
                api.followers_ids, screen_name=screen_name).pages():
            if max_connections and (
                    len(followers) + len(follower_ids)) > max_connections:
                logger.info(
                    'Max connections reached... trimming final request')
                follower_ids = follower_ids[:max_connections - len(followers)]
                max_connections_reached = True
            followers.extend(lookup_users(api, follower_ids))
            if max_connections_reached:
                break
    except Exception as e:
        logger.error('Error fetching followers: {}'.format(e))
    return followers
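
# Both the friends/ids and followers/ids endpoints are rate limited to 15
# requests per 15-minute window, so crawls of large accounts spend most of
# their time sleeping on the rate limit (see wait_on_rate_limit in main).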


def parse_args():
    """Parses the command line arguments.
    """
    parser = argparse.ArgumentParser(
        description='Crawl a Twitter user\'s social network')
    parser.add_argument('user', type=str, help='User screen name to crawl')
    parser.add_argument(
        '--graph-file',
        '-g',
        type=str,
        help='Filename for the output GEXF graph',
        default=DEFAULT_GRAPH_FILENAME)
    parser.add_argument(
        '--raw',
        '-r',
        type=str,
        help='Filename for the raw gzipped JSON data',
        default=DEFAULT_RAW_RESULTS_FILENAME)
    parser.add_argument(
        '--degree',
        '-d',
        type=int,
        help='Max degree of the crawl',
        default=DEFAULT_CRAWL_DEGREE)
    parser.add_argument(
        '--max-connections',
        '-c',
        type=int,
        help='Max number of connections per account to crawl',
        default=DEFAULT_MAX_CONNECTIONS)
    parser.add_argument(
        '--root-connections',
        '-rc',
        help='Only keep edges among the original user and their direct '
        'connections',
        default=False,
        action='store_true')
    parser.add_argument(
        '--dynamic',
        help='Store the results as a dynamic graph instead of a static graph',
        default=False,
        action='store_true')
    return parser.parse_args()
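
# Example invocation (a sketch; 'some_user' is a placeholder screen name):
#
#     python crawl_network.py some_user --degree 2 --max-connections 5000 \
#         --graph-file some_user.gexf --raw some_user.json.gz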


def write_graph(session, args):
    """Writes the nodes and edges in the database to a GEXF file.

    Arguments:
        session {sqlalchemy.orm.Session} -- The database session
        args {argparse.Namespace} -- The parsed command line arguments,
            including the output filename and graph mode
    """
    with open(args.graph_file, 'w') as graph:
        graph_mode = 'mode="static"'
        if args.dynamic:
            graph_mode = 'mode="dynamic" timeformat="dateTime"'
        graph.write(
            "<?xml version='1.0' encoding='utf-8'?>"
            "<gexf version=\"1.2\" xmlns=\"http://www.gexf.net/1.2draft\" "
            "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
            "xsi:schemaLocation=\"http://www.gexf.net/1.2draft "
            "http://www.gexf.net/1.2draft/gexf.xsd\">"
            "<graph defaultedgetype=\"directed\" {} name=\"\">\n".format(
                graph_mode))
        # Write the nodes
        graph.write('<nodes>\n')
        for node in session.query(Node).yield_per(1000):
            graph.write('<node id="{}" label="{}" start="{}"/>\n'.format(
                node.screen_name, node.screen_name,
                node.created_date.strftime('%Y-%m-%dT%H:%M:%S')))
        graph.write('</nodes>\n')
        # Write the edges
        graph.write('<edges>\n')
        for edge in session.query(Edge).yield_per(1000):
            graph.write(
                '<edge id="{}" source="{}" target="{}" start="{}"/>\n'.format(
                    edge.id, edge.source.screen_name, edge.target.screen_name,
                    edge.created_date.strftime('%Y-%m-%dT%H:%M:%S')))
        graph.write('</edges>\n')
        graph.write('</graph>\n')
        graph.write('</gexf>\n')
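
# The resulting file looks roughly like this (abridged sketch with
# placeholder screen names):
#
#     <?xml version='1.0' encoding='utf-8'?><gexf version="1.2" ...>
#     <graph defaultedgetype="directed" mode="static" name="">
#     <nodes>
#     <node id="alice" label="alice" start="2019-01-01T00:00:00"/>
#     </nodes>
#     <edges>
#     <edge id="1" source="alice" target="bob" start="2019-01-01T00:00:00"/>
#     </edges>
#     </graph>
#     </gexf>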


def add_node(session, node_id, screen_name, args, is_root=False):
    """Gets or creates a node in the database.

    Arguments:
        session {sqlalchemy.orm.Session} -- The database session
        node_id {int} -- The Twitter account ID
        screen_name {str} -- The Twitter screen name
        args {argparse.Namespace} -- The parsed command line arguments
        is_root {bool} -- Whether or not this is a root connection
    """
    node = session.query(Node).get(node_id)
    if not node:
        node = Node(id=node_id, screen_name=screen_name, is_root=is_root)
        # If we only care about root connections, and this isn't one of them,
        # then we can avoid adding it to the database since it won't have an
        # edge anyway.
        if args.root_connections and not is_root:
            session.commit()
            return node
        session.add(node)
    session.commit()
    return node


def add_edge(session, source, target, args):
    """Adds a new edge to the database if it doesn't already exist.

    Arguments:
        session {sqlalchemy.orm.Session} -- The database session
        source {Node} -- The source node for the edge (the follower)
        target {Node} -- The target node for the edge (the followed account)
        args {argparse.Namespace} -- The parsed command line arguments
    """
    # Check if this edge already exists
    edge = session.query(Edge).filter(Edge.source_id == source.id,
                                      Edge.target_id == target.id).first()
    if edge:
        session.commit()
        return
    # Otherwise, create a new edge. When --root-connections is set, only
    # keep edges where both endpoints are root connections.
    if not args.root_connections or (source.is_root and target.is_root):
        edge = Edge(source=source, target=target)
        session.add(edge)
    session.commit()
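
# Because add_edge first checks for an existing (source, target) pair, it is
# safe to call repeatedly for the same relationship: e.g. crawling account A
# records A -> B for friend B, and crawling B later sees A as a follower and
# calls add_edge with the same pair, which is then skipped.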


def main():
    args = parse_args()
    consumer_key = os.environ.get('TWEEPY_CONSUMER_KEY')
    consumer_secret = os.environ.get('TWEEPY_CONSUMER_SECRET')
    access_token = os.environ.get('TWEEPY_ACCESS_TOKEN')
    access_token_secret = os.environ.get('TWEEPY_ACCESS_TOKEN_SECRET')
    if not (consumer_key and consumer_secret and access_token
            and access_token_secret):
        logger.error(
            'Need to specify the OAuth configuration via the '
            'TWEEPY_CONSUMER_KEY, TWEEPY_CONSUMER_SECRET, '
            'TWEEPY_ACCESS_TOKEN and TWEEPY_ACCESS_TOKEN_SECRET '
            'environment variables.')
        sys.exit(1)
    user_auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    user_auth.set_access_token(access_token, access_token_secret)
    # Sleep through rate-limit windows instead of failing, since a crawl
    # makes far more requests than a single 15-minute window allows
    api = tweepy.API(
        user_auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
    # Set up the database
    database_path = '{}.db'.format(args.user)
    engine = create_engine('sqlite:///{}'.format(database_path))
    session_factory = sessionmaker(bind=engine)
    session = scoped_session(session_factory)
    Base.metadata.create_all(engine)
    streamer = JSONStreamer(args.raw)
    seen_accounts = {}
    crawl_queue = queue.Queue()
    current_count = 0
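
    # The crawl is a breadth-first traversal: the queue holds
    # (degree, account) tuples, and seen_accounts prevents re-enqueueing
    # accounts that have already been scheduled.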
    # Add the initial user to crawl
    try:
        user = api.get_user(screen_name=args.user)
    except Exception as e:
        logger.error('Failed to get user {}: {}'.format(args.user, e))
        sys.exit(1)
    crawl_queue.put_nowait((0, user._json))
    try:
        while not crawl_queue.empty():
            degree, account = crawl_queue.get()
            # After incrementing, `degree` is the degree that this account's
            # connections will have in the graph
            degree += 1
            current_count += 1
            screen_name = account['screen_name']
            logger.info(
                'Fetching network for: {} (current count: {} queue size: {})'.
                format(screen_name, current_count, crawl_queue.qsize()))
            account['friends'] = get_friends(
                api, screen_name, max_connections=args.max_connections)
            logger.info('\tFound {} friends for {}'.format(
                len(account['friends']), screen_name))
            account['followers'] = get_followers(
                api, screen_name, max_connections=args.max_connections)
            logger.info('\tFound {} followers for {}'.format(
                len(account['followers']), screen_name))
            streamer.write_row(account)
            streamer.flush()
            # Only the original user and their direct connections count as
            # "root" connections
            is_root = degree == 1
            # Get or create the node for the account we're crawling
            account_node = add_node(
                session,
                account['id'],
                account['screen_name'],
                args,
                is_root=is_root)
            for friend in account['friends']:
                friend_node = add_node(
                    session,
                    friend['id'],
                    friend['screen_name'],
                    args,
                    is_root=is_root)
                # The crawled account follows its friends
                add_edge(session, account_node, friend_node, args)
                if degree > args.degree:
                    continue
                if friend['id'] in seen_accounts:
                    continue
                seen_accounts[friend['id']] = True
                crawl_queue.put_nowait((degree, friend))
            for follower in account['followers']:
                follower_node = add_node(
                    session,
                    follower['id'],
                    follower['screen_name'],
                    args,
                    is_root=is_root)
                # Followers follow the crawled account
                add_edge(session, follower_node, account_node, args)
                if degree > args.degree:
                    continue
                if follower['id'] in seen_accounts:
                    continue
                seen_accounts[follower['id']] = True
                crawl_queue.put_nowait((degree, follower))
            # I can't explain why these cause a memory leak if they aren't
            # explicitly pop'd. References to `account` should be removed
            # on the next iteration, but that doesn't appear to happen.
            account.pop('friends')
            account.pop('followers')
    except KeyboardInterrupt:
        print('CTRL+C received... shutting down')
    # Export the graph, then clean up the scratch database and the raw
    # results stream
    write_graph(session, args)
    os.remove(database_path)
    streamer.close()


if __name__ == '__main__':
    main()