#!/usr/bin/env python
# -*- coding: UTF-8 -*-
__author__="Scott Hendrickson, Josh Montague"
__license__="Simplified BSD"
import pkg_resources
try:
__version__ = pkg_resources.require("gnacs")[0].version
except pkg_resources.DistributionNotFound:
__version__ = "N/A"
import sys
import codecs
import fileinput
import re
import os
import argparse
from acscsv import *
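# the wildcard import is expected to supply the publisher-specific extractor
# modules used below (twitter_acs, disqus_acs, wordpress_acs, tumblr_acs,
# foursquare_acs, getglue_acs, stocktwits_acs, stocktwits_native,
# newsgator_acs) along with reflect_json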
# needed only for the pretty-printing
import json as json_printer
# use fastest option available for parsing
try:
import ujson as json
except ImportError:
try:
import json
except ImportError:
import simplejson as json
# unicode output: encode stdout as UTF-8
reload(sys)
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
def gnacs_args():
"""Parse comand line arguemnts for defining input and output of command line utility."""
parser = argparse.ArgumentParser(
description="Parse seqeunce of JSON formated activities.")
parser.add_argument("file_name", metavar= "file_name", nargs="?"
, default=[]
, help="Input file name (optional).")
parser.add_argument("-a","--status", action="store_true", dest="status"
, default=False
, help="Version, status, etc.")
parser.add_argument("-g","--geo", action="store_true", dest="geo"
, default=False
, help="Include geo fields")
parser.add_argument("-i", "--influence", action="store_true", dest="influence"
, default=False
, help="Show user's influence metrics (Twitter only)")
parser.add_argument("-c","--csv", action="store_true", dest="csv"
, default=False
, help="Comma-delimited output (, default is | without quotes)")
parser.add_argument("-l","--lang", action="store_true", dest="lang"
, default=False
, help="Include language fields")
parser.add_argument("-j","--geojson", action="store_true", dest="geojson"
, default=False
, help="Output is geojson format (Foursquare and Twitter only) \
Caution: dataset must fit in memory.")
parser.add_argument("-o","--origin", action="store_true", dest="origin"
, default=False
, help="Include source/origin fields")
parser.add_argument("-p","--pretty", action="store_true", dest="pretty"
, default=False
, help="Pretty JSON output of full records")
parser.add_argument("-s", "--urls", action="store_true", dest="urls"
, default=False
, help="Include urls fields")
parser.add_argument("-t","--structure", action="store_true", dest="struct"
, default=False
, help="Include thread linking fields")
parser.add_argument("-r","--rules", action="store_true", dest="rules"
, default=False
, help="Include rules fields")
parser.add_argument("-u","--user", action="store_true", dest="user"
, default=False
, help="Include user fields")
parser.add_argument("-v","--version", action="store_true", dest="ver"
, default=False
, help="Show version number")
parser.add_argument("-x","--explain", action="store_true", dest="explain",
default=False
, help="Show field names in output for sample input records")
parser.add_argument("-z","--publisher", dest="pub"
, default="twitter"
, help="Publisher (default is twitter), twitter, newsgator, disqus, \
wordpress, wpcomments, tumblr, foursquare, getglue, stocktwits, stocktwits-native")
parser.add_argument("-k","--keypath", dest="keypath"
, default=None
, help="returns a value from a path of the form 'key:value'")
return parser
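# Example invocations (assuming this module is installed as the 'gnacs.py'
# console script and 'activities.json' contains one JSON activity per line):
#   cat activities.json | gnacs.py -g -u -z twitter
#   gnacs.py --pretty activities.json
#   gnacs.py -j -z foursquare activities.json > checkins.geojson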
if __name__ == "__main__":
"""Use gnacs delimited-field parsing libraries as a command line tool to parse a series of JSON
formatted actvities from file, compressed file or standard input (stdin)."""
options = gnacs_args().parse_args()
if options.ver:
print "*"*70
print "Gnacs Version: %s"%__version__
print "Please see https://github.com/DrSkippy27/Gnacs for updates or"
print "sudo pip install gnacs --upgrade to install the latest version."
print "*"*70
sys.exit()
#
delim = "|" # default delimiter
if options.csv:
delim = "," # csv delimiter
elif options.geojson:
options.geo = True
# note: geojson option creates an in-memory structure
sys.stdout.write('{"type": "FeatureCollection", "features": [')
#
if options.pub.lower().startswith("word") or options.pub.lower().startswith("wp"):
processing_obj = wordpress_acs.WPacsCSV(delim
, options.keypath
, options.user
, options.rules
, options.lang
, options.struct
)
elif options.pub.lower().startswith("disq"):
processing_obj = disqus_acs.DiacsCSV(delim
, options.keypath
, options.user
, options.rules
, options.lang
, options.struct
, options.status
)
elif options.pub.lower().startswith("tumb"):
processing_obj = tumblr_acs.TblracsCSV(delim
, options.keypath
, options.user
, options.rules
, options.lang
, options.struct
)
elif options.pub.lower().startswith("four") or options.pub.lower().startswith("fsq"):
processing_obj = foursquare_acs.FsqacsCSV(delim
, options.keypath
, options.geo
, options.user
, options.rules
)
elif options.pub.lower().startswith("get") or options.pub.lower().startswith("gg"):
processing_obj = getglue_acs.GgacsCSV(delim
, options.keypath
, options.user
, options.rules
, options.urls
, options.origin
)
elif options.pub.lower().startswith("st") and options.pub.lower().endswith("native"):
processing_obj = stocktwits_native.StocktwitsNative(delim
, options.keypath
, options.user
, options.struct
, options.influence
)
elif options.pub.lower().startswith("st"):
processing_obj = stocktwits_acs.StacsCSV(delim
, options.keypath
, options.user
, options.struct
, options.influence
)
elif options.pub.lower().startswith("news") or options.pub.lower().startswith("ng"):
processing_obj = newsgator_acs.NGacsCSV(delim
, options.keypath
, options.urls
, options.user
)
else:
processing_obj = twitter_acs.TwacsCSV(delim
, options.keypath
, options.geo
, options.user
, options.rules
, options.urls
, options.lang
, options.influence
, options.struct
)
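# whichever branch ran, processing_obj provides file_reader() to iterate input
# records and procRecord() to emit a delimited row; asGeoJSON() backs the
# -j/--geojson path (Foursquare and Twitter only, per the help text)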
#
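# first_geo suppresses the comma separator before the first geojson Feature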
first_geo = True
for line_number, record in processing_obj.file_reader(options.file_name):
if options.pretty:
print json_printer.dumps(record, indent=3, ensure_ascii=False)
continue
try:
if options.explain:
#### TODO: fix -x option for new extractors ####
print >>sys.stderr, "\n****\n\n'explain' functionality currently unavailable\n\n****\n"
sys.exit()
################################################
record = reflect_json.reflect_json(record)
sys.stdout.write("%s\n"%processing_obj.procRecord(record))
elif options.geojson:
# geo-tag coords
geo_rec = processing_obj.asGeoJSON(record)
if geo_rec is not None:
if not first_geo:
sys.stdout.write(",")
sys.stdout.write(json.dumps(geo_rec))
first_geo = False
else:
# ensure formatter is working on a unicode object
sys.stdout.write(u"{}\n".format(processing_obj.procRecord(record, emptyField="None")))
# handle I/O exceptions associated with writing to stdout (e.g. when output is piped to 'head')
# TODO: handle this via contextmanager (within AcsCSV)?
except IOError:
try:
sys.stdout.close()
except IOError:
pass
try:
sys.stderr.close()
except IOError:
pass
break
# close the geojson data structure
if options.geojson:
sys.stdout.write(']}\n')