Import datafreeze source code.

This commit is contained in:
Friedrich Lindenberg 2013-04-04 23:05:19 +02:00
parent 4d9ecb2532
commit c2d606bc6f
11 changed files with 419 additions and 3 deletions

View File

37
dataset/freeze/app.py Normal file
View File

@ -0,0 +1,37 @@
import logging
import argparse
from dataset.util import FreezeException
from dataset.freeze.config import Configuration
from dataset.freeze.engine import ExportEngine
from dataset.freeze.format import get_serializer
# Configure logging early so module-level and library loggers emit during
# a freeze run; DEBUG is intentional for a command-line extraction tool.
logging.basicConfig(level=logging.DEBUG)
log = logging.getLogger(__name__)

# Command-line interface: a single positional argument naming the freeze
# configuration file (JSON or YAML, parsed by Configuration).
parser = argparse.ArgumentParser(
    description='Generate static JSON and CSV extracts from a SQL database.',
    epilog='For further information, please check the documentation.')
parser.add_argument('config', metavar='CONFIG', type=str,
                    help='freeze file configuration')
def main():
    """Entry point for the ``datafreeze`` command.

    Parses the command line, loads the freeze file, and runs every export
    it defines; exports flagged ``skip`` are logged and ignored. Errors in
    configuration or queries surface as FreezeException and are reported
    through the logger instead of a traceback.
    """
    try:
        args = parser.parse_args()
        config = Configuration(args.config)
        for export in config.exports:
            if export.skip:
                log.info("Skipping: %s", export.name)
                continue
            log.info("Running: %s", export.name)
            engine = ExportEngine(export)
            # NOTE: the original also called engine.query() here and
            # discarded the result; serialize() issues the query itself,
            # so that line executed every export's SQL twice.
            serializer_cls = get_serializer(export)
            serializer = serializer_cls(engine)
            serializer.serialize()
    # `except X, e` is Python-2-only syntax; `as` works on 2.6+ and 3.x.
    except FreezeException as fe:
        log.error(fe)


if __name__ == '__main__':
    main()

82
dataset/freeze/config.py Normal file
View File

@ -0,0 +1,82 @@
import json

import yaml

from dataset.util import FreezeException
# String values (after lower-casing) interpreted as boolean true by
# Export.get_bool.
TRUISH = ['true', 'yes', '1', 'on']

# Maps freeze-file extensions to the module used to parse them; both
# expose a compatible ``load(fileobj)`` function.
DECODER = {
    'json': json,
    'yaml': yaml
}
def merge_overlay(data, overlay):
    """Recursively layer ``data`` on top of ``overlay``.

    Returns a new dict containing every key from ``overlay``, with keys
    from ``data`` taking precedence; when both sides hold a dict for the
    same key, the two dicts are merged recursively.
    """
    merged = overlay.copy()
    for key, value in data.items():
        base = merged.get(key)
        if isinstance(value, dict) and isinstance(base, dict):
            merged[key] = merge_overlay(value, base)
        else:
            merged[key] = value
    return merged
class Configuration(object):
    """A parsed freeze file.

    The on-disk format (JSON or YAML) is chosen by file extension,
    falling back to JSON for unknown extensions.
    """

    def __init__(self, file_name):
        # Path to the freeze file, kept for error reporting.
        self.file_name = file_name
        extension = file_name.rsplit('.', 1)[-1]
        loader = DECODER.get(extension, json)
        try:
            fh = open(file_name, 'rb')
            try:
                # NOTE(review): for YAML this is the full (unsafe) loader;
                # freeze files are assumed to be trusted local config.
                self.data = loader.load(fh)
            except ValueError as ve:
                raise FreezeException("Invalid freeze file: %s" % ve)
            finally:
                # Close the handle even when parsing fails; the original
                # leaked it on a parse error.
                fh.close()
        except IOError as ioe:
            raise FreezeException(unicode(ioe))

    @property
    def exports(self):
        """Yield an Export per entry, validating the root layout first."""
        if not isinstance(self.data, dict):
            raise FreezeException("The root element of the freeze file needs to be a hash")
        if not isinstance(self.data.get('exports'), list):
            raise FreezeException("The freeze file needs to have a list of exports")
        # Settings under 'common' act as defaults for every export.
        common = self.data.get('common', {})
        for export in self.data.get('exports'):
            yield Export(common, export)
class Export(object):
    """A single export definition overlaid on the shared 'common' block."""

    def __init__(self, common, data):
        # Export-specific settings win over the shared defaults.
        self.data = merge_overlay(data, common)

    def get(self, name, default=None):
        """Raw configuration lookup with an optional fallback."""
        return self.data.get(name, default)

    def get_normalized(self, name, default=None):
        """Like get(), but lower-cases and strips non-default values."""
        value = self.get(name, default=default)
        if value in [None, default]:
            return value
        return unicode(value).lower().strip()

    def get_bool(self, name, default=False):
        """Interpret a setting as a boolean (see TRUISH for true values)."""
        value = self.get_normalized(name)
        return default if value is None else value in TRUISH

    def get_int(self, name, default=None):
        """Interpret a setting as an integer."""
        value = self.get_normalized(name)
        return default if value is None else int(value)

    @property
    def skip(self):
        # Exports flagged 'skip' are ignored by the runner.
        return self.get_bool('skip')

    @property
    def name(self):
        # Fall back to the raw query text when no name was given.
        return self.get('name', self.get('query'))

42
dataset/freeze/engine.py Normal file
View File

@ -0,0 +1,42 @@
from sqlalchemy import create_engine
from sqlalchemy.exc import ProgrammingError
from dataset.util import FreezeException
class Query(object):
    """Lazy wrapper around a SQL result proxy.

    Iterating yields one dict per row, keyed by column name; ``len()``
    reports the driver's rowcount.
    """

    def __init__(self, query, rp):
        self.query = query
        self.rp = rp

    def __len__(self):
        # Row count as reported by the database driver.
        return self.rp.rowcount

    def __iter__(self):
        keys = self.rp.keys()
        row = self.rp.fetchone()
        while row is not None:
            yield dict(zip(keys, row))
            row = self.rp.fetchone()
class ExportEngine(object):
    """Lazily-connected database handle for a single export."""

    def __init__(self, config):
        self.config = config

    @property
    def engine(self):
        # Create the SQLAlchemy engine on first use and cache it, so an
        # export that never runs a query never opens a connection.
        if not hasattr(self, '_engine'):
            self._engine = create_engine(self.config.get('database'))
        return self._engine

    def query(self):
        """Execute the export's SQL and return an iterable Query.

        Raises FreezeException when the database rejects the statement.
        """
        q = self.config.get('query')
        try:
            # Keep the try body to the one call that can raise.
            rp = self.engine.execute(q)
        # `except X, e` is Python-2-only syntax; `as` works on 2.6+.
        except ProgrammingError as pe:
            raise FreezeException("Invalid query: %s - %s" % (q, pe))
        return Query(q, rp)

View File

@ -0,0 +1,14 @@
from dataset.freeze.format.fjson import JSONSerializer
from dataset.freeze.format.fcsv import CSVSerializer
from dataset.freeze.format.ftabson import TabsonSerializer
# Registry of output formats; keys match the normalized 'format' setting.
SERIALIZERS = {
    'json': JSONSerializer,
    'csv': CSVSerializer,
    'tabson': TabsonSerializer
}


def get_serializer(config):
    """Return the serializer class for an export (default: 'json').

    Returns None when the configured format is not in SERIALIZERS.
    """
    name = config.get_normalized('format', 'json')
    return SERIALIZERS.get(name)

View File

@ -0,0 +1,76 @@
import os
import logging
import re
import locale
from dataset.util import FreezeException, slug
# Matches {{key}} / {{op:key}} placeholders in configured file names.
TMPL_KEY = re.compile("{{([^}]*)}}")

# Transformations that may prefix a placeholder key, e.g. {{slug:title}}.
OPERATIONS = {
    'identity': lambda x: x,
    'lower': lambda x: unicode(x).lower(),
    'slug': slug
}
class Serializer(object):
    """Base class for export serializers.

    Resolves templated output paths from result rows and guards against
    writes outside the configured prefix directory. Subclasses implement
    init(), write(path, row) and close().
    """

    def __init__(self, engine):
        self.engine = engine
        self.config = engine.config
        # Output paths already validated/created during this run.
        self._paths = []
        self._get_basepath()

    def _get_basepath(self):
        # Resolve the output prefix to an absolute, symlink-free path so
        # the escape check in file_name() compares canonical paths.
        # NOTE(review): a missing 'prefix' setting would make abspath()
        # fail here — presumably the freeze file always provides one.
        prefix = self.config.get('prefix')
        prefix = os.path.abspath(prefix)
        prefix = os.path.realpath(prefix)
        self._prefix = prefix
        filename = self.config.get('filename')
        if filename is None:
            raise FreezeException("No 'filename' is specified")
        self._basepath = os.path.join(prefix, filename)

    def _tmpl(self, data):
        # Expand {{op:key}} placeholders in the base path using values
        # from the current row; unknown keys expand to ''.
        def repl(m):
            op, key = 'identity', m.group(1)
            if ':' in key:
                op, key = key.split(':', 1)
            return unicode(OPERATIONS.get(op)(data.get(key, '')))
        path = TMPL_KEY.sub(repl, self._basepath)
        # Encode with the locale's preferred encoding for the filesystem;
        # unencodable characters are replaced rather than failing.
        enc = locale.getpreferredencoding()
        return os.path.realpath(path.encode(enc, 'replace'))

    def file_name(self, row):
        """Return the output path for *row*, creating directories as needed.

        Raises FreezeException if the templated path resolves outside the
        configured prefix (path-traversal protection).
        """
        path = self._tmpl(row)
        if path not in self._paths:
            if not path.startswith(self._prefix):
                raise FreezeException("Possible path escape detected.")
            dn = os.path.dirname(path)
            if not os.path.isdir(dn):
                os.makedirs(dn)
            self._paths.append(path)
        return path

    @property
    def mode(self):
        # 'list' emits all rows per file; 'item' emits a single row.
        mode = self.config.get_normalized('mode', 'list')
        if mode not in ['list', 'item']:
            raise FreezeException("Invalid mode: %s" % mode)
        return mode

    @property
    def wrap(self):
        # Whether results are wrapped in a metadata envelope; defaults to
        # wrapping only in 'list' mode.
        return self.config.get_bool('wrap',
            default=self.mode=='list')

    def serialize(self):
        """Run the export: query, write each row to its file, finalize."""
        self.init()
        query = self.engine.query()
        for row in query:
            self.write(self.file_name(row), row)
        self.close()

View File

@ -0,0 +1,37 @@
import csv
from datetime import datetime
from dataset.freeze.format.common import Serializer
def value_to_str(value):
    """Coerce a cell value into something the csv writer can emit.

    datetime -> ISO 8601 string, unicode -> UTF-8 bytes, None -> '',
    everything else is passed through unchanged.
    """
    if isinstance(value, datetime):
        return value.isoformat()
    if isinstance(value, unicode):
        return value.encode('utf-8')
    return '' if value is None else value
class CSVSerializer(Serializer):
    """Writes each result bucket to a CSV file, one open handle per path."""

    def init(self):
        # Maps output path -> (csv writer, file handle).
        self.handles = {}

    def write(self, path, result):
        keys = result.keys()
        if path not in self.handles:
            # First row for this file: open it and emit the header line.
            fh = open(path, 'wb')
            writer = csv.writer(fh)
            writer.writerow([k.encode('utf-8') for k in keys])
            self.handles[path] = (writer, fh)
        writer, fh = self.handles[path]
        writer.writerow([value_to_str(result.get(k)) for k in keys])

    def close(self):
        for writer, fh in self.handles.values():
            fh.close()

View File

@ -0,0 +1,45 @@
import json
from datetime import datetime
from collections import defaultdict
from dataset.freeze.format.common import Serializer
class JSONEncoder(json.JSONEncoder):
    """JSON encoder that serializes datetimes as ISO 8601 strings."""

    def default(self, obj):
        if isinstance(obj, datetime):
            return obj.isoformat()
        # Defer to the base class so unsupported types raise TypeError;
        # the original fell off the end and returned None, silently
        # serializing unencodable values as null.
        return json.JSONEncoder.default(self, obj)
class JSONSerializer(Serializer):
    """Buffers rows per output path and dumps each bucket as JSON."""

    def init(self):
        # path -> list of result rows destined for that file.
        self.buckets = defaultdict(list)

    def write(self, path, result):
        self.buckets[path].append(result)

    def wrap(self, result):
        """Apply the mode/wrap/meta settings to a bucket before dumping."""
        count = len(result)
        if self.mode == 'item':
            result = result[0]
        # BUG FIX: defining this method shadows the base class's `wrap`
        # property, so the original's `if self.wrap:` tested the bound
        # method itself (always truthy) and the configured 'wrap' flag
        # was ignored. Read the property through the base class instead.
        if Serializer.wrap.fget(self):
            result = {
                'count': count,
                'results': result
            }
        meta = self.config.get('meta')
        if meta is not None:
            result['meta'] = meta
        return result

    def close(self):
        for path, result in self.buckets.items():
            result = self.wrap(result)
            fh = open(path, 'wb')
            try:
                json.dump(result, fh,
                          cls=JSONEncoder,
                          indent=self.config.get_int('indent'))
            finally:
                # Close even if serialization fails mid-dump; the
                # original leaked the handle on error.
                fh.close()

View File

@ -0,0 +1,24 @@
from dataset.freeze.format.fjson import JSONSerializer
class TabsonSerializer(JSONSerializer):
    """JSON variant emitting a compact {count, fields, data} table layout."""

    def wrap(self, result):
        fields = []
        data = []
        if len(result):
            # Column order is taken from the first row.
            keys = result[0].keys()
            fields = [{'id': key} for key in keys]
            data = [[row.get(key) for key in keys] for row in result]
        wrapped = {
            'count': len(result),
            'fields': fields,
            'data': data
        }
        meta = self.config.get('meta')
        if meta is not None:
            wrapped['meta'] = meta
        return wrapped

54
dataset/util.py Normal file
View File

@ -0,0 +1,54 @@
#coding: utf-8
import re
from unicodedata import normalize as ucnorm, category
# Punctuation and whitespace that slug() collapses into '-' separators.
SLUG_REMOVE = re.compile(r'[,\s\.\(\)/\\;:]*')
class DatasetException(Exception):
    """Base class for all errors raised by the dataset package."""
    pass
class FreezeException(DatasetException):
    """Raised for configuration or export errors during a freeze run."""
    pass
def normalize(text):
    """ Simplify a piece of text to generate a more canonical
    representation. This involves lowercasing, stripping trailing
    spaces, removing symbols, diacritical marks (umlauts) and
    converting all newlines etc. to single spaces.
    """
    if not isinstance(text, unicode):
        text = unicode(text)
    text = text.lower()
    # Decompose so combining marks become separate characters that can
    # be filtered by Unicode category.
    decomposed = ucnorm('NFKD', text)
    filtered = []
    for char in decomposed:
        cat = category(char)
        if cat.startswith('C'):
            # control characters become spaces
            filtered.append(' ')
        elif cat.startswith('M'):
            # marks, such as umlauts
            continue
        elif cat.startswith('Z'):
            # newlines, non-breaking etc.
            filtered.append(' ')
        elif cat.startswith('S'):
            # symbols, such as currency
            continue
        else:
            filtered.append(char)
    text = u''.join(filtered)
    # BUG FIX: the loop read "while ' ' in text: replace(' ', ' ')" — a
    # no-op replacement that never terminates once any space is present.
    # The intended behavior is collapsing runs of spaces to one.
    while '  ' in text:
        text = text.replace('  ', ' ')
    text = text.strip()
    return ucnorm('NFKC', text)
def slug(text):
    """ Create a version of a string convenient for use in a URL
    or file name. """
    normalized = normalize(text)
    normalized = normalized.replace(u'ß', 'ss')
    parts = [part for part in SLUG_REMOVE.split(normalized) if len(part)]
    return '-'.join(parts).lower()

View File

@ -23,9 +23,14 @@ setup(
zip_safe=False, zip_safe=False,
install_requires=[ install_requires=[
'sqlalchemy>=0.7', 'sqlalchemy>=0.7',
'sqlalchemy-migrate>=0.7' 'sqlalchemy-migrate>=0.7',
"argparse >= 1.2.1",
"PyYAML >= 3.10"
], ],
tests_require=[], tests_require=[],
entry_points=\ entry_points={
""" """, 'console_scripts': [
'datafreeze = dataset.freeze.app:main',
]
}
) )