Source code for acscsv.snowflake

#!/usr/bin/env python
import sys
import re
import time
import datetime
import calendar

"""
From twitter code on github
 workerIdBits = 5L
 datacenterIdBits = 5L
 sequenceBits = 12L
 
 maxWorkerId = -1L ^ (-1L << workerIdBits)
 maxDatacenterId = -1L ^ (-1L << datacenterIdBits)

 sequenceMask = -1L ^ (-1L << sequenceBits)
 
 workerIdShift = sequenceBits
 datacenterIdShift = sequenceBits + workerIdBits
 timestampLeftShift = sequenceBits + workerIdBits + datacenterIdBits

 ((timestamp - twepoch) << timestampLeftShift) |
          (datacenterId << datacenterIdShift) |
              (workerId << workerIdShift) | 
               sequence
"""

FMT = "%Y-%m-%dT%H:%M:%S"
TWEPOCH = 1288834974657.

SF_SEQ_BITS = 12
SF_WORK_BITS = 5
SF_DC_BITS = 5
SF_TIME_BITS = 41
SF_BITS = 64

nRE = re.compile("[0-9]{18}")

[docs]class Snowflake(object): """Snowflake id object provides access to multiple forms of of the bit fields present in a snowflake id. Ids can be of the form of a string with leading and trailing characters or symbols. The first 18 consecutive digits will be used as the id. It is also permitted to provide an int:: tag:search.twitter.com,2005:113733024721539072 113733024721539072 113733024721539072|This is the tweet for which you are looking|en where the latter represents a pre-snowflake twitter id. In this case all fields will be null. When the input does not match a valid snowflake id, the input field is available as the id and all other fields are set to None. Available fields:: id sequence worker data_center ts sample_set timestamp timeStruct timeString year month day hour min sec dow doy """ def __init__(self, id): """Create a new snowflake object from string or number represenataion of id.""" ns = nRE.findall(str(id)) if len(ns) > 0: # only process first matching id in string self.id = int(ns[0]) self.sequence = int(self.masked_id(SF_SEQ_BITS, 0)) self.worker = int(self.masked_id(SF_WORK_BITS, SF_SEQ_BITS)) self.data_center = int(self.masked_id(SF_DC_BITS, SF_WORK_BITS + SF_SEQ_BITS)) self.ts = int(self.masked_id(SF_TIME_BITS, SF_DC_BITS + SF_WORK_BITS + SF_SEQ_BITS)) self.sample_set = self.ts % 100 # originally ((self.id >> 22) + TWEPOCH)/1000.0 self.timestamp = (self.ts + TWEPOCH)/1000. self.timeStruct = time.gmtime(self.timestamp) self.timeString = time.strftime(FMT, self.timeStruct) self.year = self.timeStruct.tm_year self.month = self.timeStruct.tm_mon self.day = self.timeStruct.tm_mday self.hour = self.timeStruct.tm_hour self.min = self.timeStruct.tm_min self.sec = self.timeStruct.tm_sec self.dow = self.timeStruct.tm_wday self.doy = self.timeStruct.tm_yday #self.trials = [self.ndigits(self.id, 2) # , self.ndigits(self.ts, 2) # , self.ndigits(self.timestamp, 2)] if len(ns) == 0 or self.year < 2010 or self.year > datetime.datetime.now().year + 1: # no valid snowflake id found self.id = id # pass through input self.sequence = self.worker = self.data_center = self.ts = self.timestamp = None self.timeStruct = self.timeString = self.year = self.month = self.day = None self.hour = self.min = self.sec = self.dow = self.doy = None #def ndigits(self, x, n): #return int(x - (10**n) * int(x/(10**n))) def masked_id(self, bits, pos): # returns an int mask = int('1'*bits, 2) << pos res = (mask & self.id) >> pos #print '%s' % bin(self.id).rjust(65) #print '%s' % bin(mask).rjust(65) #print '%s' % bin(res).rjust(65) return res def get_id_datetime(self): return [self.id, self.timeString] def __repr__(self): res = "#"*15 + "\n" res += "id: %s\n"%self.id res += "seq: %s\n"%self.sequence res += "worker: %s\n"%self.worker res += "DS: %s\n"%self.data_center res += "Seconds: %s\n"%self.timestamp res += "time: %s\n"%self.timeString return res
if __name__ == "__main__": import csv wrtr = csv.writer(sys.stdout) for r in sys.stdin: ns = nRE.findall(r) try: for x in ns: sf = Snowflake(x) wrtr.writerow([sf.id , sf.sequence , sf.worker , sf.data_center , sf.timeString , sf.hour , sf.min , sf.sec]) except IndexError: sys.stderr.write( "ERROR %s"%ns)