From c2d606bc6f604c6fbd89c99cfaea9150cd41f652 Mon Sep 17 00:00:00 2001
From: Friedrich Lindenberg
Date: Thu, 4 Apr 2013 23:05:19 +0200
Subject: [PATCH] Import datafreeze source code.

---
 dataset/freeze/__init__.py        |   0
 dataset/freeze/app.py             |  43 +++++++++++++
 dataset/freeze/config.py          | 102 +++++++++++++++++++++++++++++++
 dataset/freeze/engine.py          |  50 +++++++++++++++
 dataset/freeze/format/__init__.py |  20 ++++++
 dataset/freeze/format/common.py   |  90 +++++++++++++++++++++++++++
 dataset/freeze/format/fcsv.py     |  39 ++++++++++++
 dataset/freeze/format/fjson.py    |  55 +++++++++++++++++
 dataset/freeze/format/ftabson.py  |  24 +++++++
 dataset/util.py                   |  58 ++++++++++++++++++
 setup.py                          |  11 +++--
 11 files changed, 489 insertions(+), 3 deletions(-)
 create mode 100644 dataset/freeze/__init__.py
 create mode 100644 dataset/freeze/app.py
 create mode 100644 dataset/freeze/config.py
 create mode 100644 dataset/freeze/engine.py
 create mode 100644 dataset/freeze/format/__init__.py
 create mode 100644 dataset/freeze/format/common.py
 create mode 100644 dataset/freeze/format/fcsv.py
 create mode 100644 dataset/freeze/format/fjson.py
 create mode 100644 dataset/freeze/format/ftabson.py
 create mode 100644 dataset/util.py

diff --git a/dataset/freeze/__init__.py b/dataset/freeze/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/dataset/freeze/app.py b/dataset/freeze/app.py
new file mode 100644
index 0000000..1f57700
--- /dev/null
+++ b/dataset/freeze/app.py
@@ -0,0 +1,43 @@
+import logging
+import argparse
+
+from dataset.util import FreezeException
+from dataset.freeze.config import Configuration
+from dataset.freeze.engine import ExportEngine
+from dataset.freeze.format import get_serializer
+
+
+logging.basicConfig(level=logging.DEBUG)
+log = logging.getLogger(__name__)
+
+parser = argparse.ArgumentParser(
+    description='Generate static JSON and CSV extracts from a SQL database.',
+    epilog='For further information, please check the documentation.')
+parser.add_argument('config', metavar='CONFIG', type=str,
+                    help='freeze file configuration')
+
+
+def main():
+    """ Run every export declared in the freeze file given on the
+    command line. Exports flagged as ``skip`` are logged and ignored;
+    a FreezeException is logged as an error and ends the run. """
+    try:
+        args = parser.parse_args()
+        config = Configuration(args.config)
+        for export in config.exports:
+            if export.skip:
+                log.info("Skipping: %s", export.name)
+                continue
+            log.info("Running: %s", export.name)
+            engine = ExportEngine(export)
+            # serialize() runs the query itself; executing it here as
+            # well would hit the database twice per export.
+            serializer_cls = get_serializer(export)
+            serializer = serializer_cls(engine)
+            serializer.serialize()
+    except FreezeException as fe:
+        log.error(fe)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/dataset/freeze/config.py b/dataset/freeze/config.py
new file mode 100644
index 0000000..bb5756c
--- /dev/null
+++ b/dataset/freeze/config.py
@@ -0,0 +1,102 @@
+import json
+import yaml
+
+from dataset.util import FreezeException
+
+
+TRUISH = ['true', 'yes', '1', 'on']
+
+DECODER = {
+    'json': json,
+    'yaml': yaml,
+    'yml': yaml
+    }
+
+
+def merge_overlay(data, overlay):
+    """ Return a copy of ``overlay`` updated with the items of ``data``.
+    Where both hold a dict under the same key the two are merged
+    recursively; otherwise the value from ``data`` wins. """
+    out = overlay.copy()
+    for k, v in data.items():
+        if isinstance(v, dict) and isinstance(out.get(k), dict):
+            v = merge_overlay(v, out.get(k))
+        out[k] = v
+    return out
+
+
+class Configuration(object):
+    """ A freeze file, parsed as JSON or YAML depending on its file
+    extension (JSON when the extension is not recognised). """
+
+    def __init__(self, file_name):
+        self.file_name = file_name
+        extension = file_name.rsplit('.', 1)[-1]
+        loader = DECODER.get(extension, json)
+        try:
+            # The with block closes the file even when parsing fails.
+            with open(file_name, 'rb') as fh:
+                # NOTE(review): yaml.load() can construct arbitrary
+                # objects, so freeze files must be trusted input;
+                # consider yaml.safe_load().
+                self.data = loader.load(fh)
+        except (ValueError, yaml.YAMLError) as ve:
+            raise FreezeException("Invalid freeze file: %s" % ve)
+        except IOError as ioe:
+            raise FreezeException(unicode(ioe))
+
+    @property
+    def exports(self):
+        """ Yield one Export per entry of ``exports``, each merged
+        over the optional ``common`` section. """
+        if not isinstance(self.data, dict):
+            raise FreezeException("The root element of the freeze file needs to be a hash")
+        if not isinstance(self.data.get('exports'), list):
+            raise FreezeException("The freeze file needs to have a list of exports")
+        # ``or`` also covers a ``common`` key that is present but empty.
+        common = self.data.get('common') or {}
+        for export in self.data.get('exports'):
+            yield Export(common, export)
+
+
+class Export(object):
+    """ The settings of one export: its own entry merged over the
+    ``common`` section of the freeze file. """
+
+    def __init__(self, common, data):
+        self.data = merge_overlay(data, common)
+
+    def get(self, name, default=None):
+        return self.data.get(name, default)
+
+    def get_normalized(self, name, default=None):
+        """ Return the setting as a lower-cased, stripped unicode
+        string; None and the default are returned unchanged. """
+        value = self.get(name, default=default)
+        if value not in [None, default]:
+            value = unicode(value).lower().strip()
+        return value
+
+    def get_bool(self, name, default=False):
+        value = self.get_normalized(name)
+        if value is None:
+            return default
+        return value in TRUISH
+
+    def get_int(self, name, default=None):
+        value = self.get_normalized(name)
+        if value is None:
+            return default
+        try:
+            return int(value)
+        except ValueError:
+            raise FreezeException("Invalid integer for '%s': %s"
+                                  % (name, value))
+
+    @property
+    def skip(self):
+        return self.get_bool('skip')
+
+    @property
+    def name(self):
+        return self.get('name', self.get('query'))
diff --git a/dataset/freeze/engine.py b/dataset/freeze/engine.py
new file mode 100644
index 0000000..3fcf229
--- /dev/null
+++ b/dataset/freeze/engine.py
@@ -0,0 +1,50 @@
+from sqlalchemy import create_engine
+from sqlalchemy.exc import ProgrammingError
+
+from dataset.util import FreezeException
+
+
+class Query(object):
+    """ Wraps a result proxy so that iterating yields each row as a
+    dict keyed by column name. """
+
+    def __init__(self, query, rp):
+        self.query = query
+        self.rp = rp
+
+    def __len__(self):
+        return self.rp.rowcount
+
+    def __iter__(self):
+        keys = self.rp.keys()
+        while True:
+            row = self.rp.fetchone()
+            if row is None:
+                return
+            yield dict(zip(keys, row))
+
+
+class ExportEngine(object):
+    """ Runs the query of a single export against its database. """
+
+    def __init__(self, config):
+        self.config = config
+
+    @property
+    def engine(self):
+        # Created on first access and cached on the instance.
+        if not hasattr(self, '_engine'):
+            self._engine = create_engine(self.config.get('database'))
+        return self._engine
+
+    def query(self):
+        """ Execute the configured query and return it as a Query.
+        Raises FreezeException if no query is set or it is invalid. """
+        q = self.config.get('query')
+        if q is None:
+            raise FreezeException("No 'query' is specified")
+        try:
+            rp = self.engine.execute(q)
+        except ProgrammingError as pe:
+            raise FreezeException("Invalid query: %s - %s" % (q, pe))
+        return Query(q, rp)
diff --git a/dataset/freeze/format/__init__.py b/dataset/freeze/format/__init__.py
new file mode 100644
index 0000000..675a84e
--- /dev/null
+++ b/dataset/freeze/format/__init__.py
@@ -0,0 +1,20 @@
+from dataset.util import FreezeException
+from dataset.freeze.format.fjson import JSONSerializer
+from dataset.freeze.format.fcsv import CSVSerializer
+from dataset.freeze.format.ftabson import TabsonSerializer
+
+SERIALIZERS = {
+    'json': JSONSerializer,
+    'csv': CSVSerializer,
+    'tabson': TabsonSerializer
+    }
+
+
+def get_serializer(config):
+    """ Return the serializer class for the export's ``format``
+    setting (``json`` when unset). Raises FreezeException for an
+    unknown format rather than returning None. """
+    serializer = config.get_normalized('format', 'json')
+    if serializer not in SERIALIZERS:
+        raise FreezeException("Invalid format: %s" % serializer)
+    return SERIALIZERS[serializer]
diff --git a/dataset/freeze/format/common.py b/dataset/freeze/format/common.py
new file mode 100644
index 0000000..bc00af5
--- /dev/null
+++ b/dataset/freeze/format/common.py
@@ -0,0 +1,90 @@
+import os
+import logging
+import re
+import locale
+
+from dataset.util import FreezeException, slug
+
+
+TMPL_KEY = re.compile("{{([^}]*)}}")
+
+OPERATIONS = {
+    'identity': lambda x: x,
+    'lower': lambda x: unicode(x).lower(),
+    'slug': slug
+    }
+
+
+class Serializer(object):
+    """ Base class of the output formats. Subclasses provide init(),
+    write(path, row) and close(); serialize() drives them. """
+
+    def __init__(self, engine):
+        self.engine = engine
+        self.config = engine.config
+        self._paths = set()
+        self._get_basepath()
+
+    def _get_basepath(self):
+        # Fall back to the current directory when no prefix is set;
+        # abspath() cannot handle None.
+        prefix = self.config.get('prefix') or '.'
+        prefix = os.path.abspath(prefix)
+        prefix = os.path.realpath(prefix)
+        self._prefix = prefix
+        filename = self.config.get('filename')
+        if filename is None:
+            raise FreezeException("No 'filename' is specified")
+        self._basepath = os.path.join(prefix, filename)
+
+    def _tmpl(self, data):
+        """ Fill the ``{{key}}`` and ``{{op:key}}`` placeholders of the
+        base path from ``data`` and return the resolved real path. """
+        def repl(m):
+            op, key = 'identity', m.group(1)
+            if ':' in key:
+                op, key = key.split(':', 1)
+            if op not in OPERATIONS:
+                raise FreezeException("Invalid operation: %s" % op)
+            return unicode(OPERATIONS[op](data.get(key, '')))
+        path = TMPL_KEY.sub(repl, self._basepath)
+        enc = locale.getpreferredencoding()
+        return os.path.realpath(path.encode(enc, 'replace'))
+
+    def file_name(self, row):
+        """ Return the output path for ``row``, creating its directory
+        the first time a path is seen. Raises FreezeException if the
+        path resolves to a location outside of the prefix. """
+        path = self._tmpl(row)
+        if path not in self._paths:
+            # Match the prefix including a trailing separator; a bare
+            # startswith(prefix) also accepts sibling directories such
+            # as "PREFIX-other".
+            if not path.startswith(os.path.join(self._prefix, '')):
+                raise FreezeException("Possible path escape detected.")
+            dn = os.path.dirname(path)
+            if not os.path.isdir(dn):
+                os.makedirs(dn)
+            self._paths.add(path)
+        return path
+
+    @property
+    def mode(self):
+        mode = self.config.get_normalized('mode', 'list')
+        if mode not in ['list', 'item']:
+            raise FreezeException("Invalid mode: %s" % mode)
+        return mode
+
+    @property
+    def wrap(self):
+        return self.config.get_bool('wrap',
+                                    default=self.mode == 'list')
+
+    def serialize(self):
+        """ Run the query and hand every row to write(), bracketed by
+        init() and close(). """
+        self.init()
+        query = self.engine.query()
+        for row in query:
+            self.write(self.file_name(row), row)
+        self.close()
diff --git a/dataset/freeze/format/fcsv.py b/dataset/freeze/format/fcsv.py
new file mode 100644
index 0000000..00413f6
--- /dev/null
+++ b/dataset/freeze/format/fcsv.py
@@ -0,0 +1,39 @@
+import csv
+from datetime import datetime
+
+from dataset.freeze.format.common import Serializer
+
+
+def value_to_str(value):
+    """ Convert a cell for the csv writer: datetimes to ISO 8601,
+    unicode to UTF-8 bytes, None to an empty string. """
+    if isinstance(value, datetime):
+        return value.isoformat()
+    if isinstance(value, unicode):
+        return value.encode('utf-8')
+    if value is None:
+        return ''
+    return value
+
+
+class CSVSerializer(Serializer):
+    """ Writes the rows of each output path to a CSV file whose first
+    line holds the column names. """
+
+    def init(self):
+        self.handles = {}
+
+    def write(self, path, result):
+        keys = result.keys()
+        if path not in self.handles:
+            fh = open(path, 'wb')
+            writer = csv.writer(fh)
+            writer.writerow([k.encode('utf-8') for k in keys])
+            self.handles[path] = (writer, fh)
+        writer, fh = self.handles[path]
+        values = [value_to_str(result.get(k)) for k in keys]
+        writer.writerow(values)
+
+    def close(self):
+        for (writer, fh) in self.handles.values():
+            fh.close()
diff --git a/dataset/freeze/format/fjson.py b/dataset/freeze/format/fjson.py
new file mode 100644
index 0000000..446a33e
--- /dev/null
+++ b/dataset/freeze/format/fjson.py
@@ -0,0 +1,55 @@
+import json
+from datetime import datetime
+from collections import defaultdict
+
+from dataset.freeze.format.common import Serializer
+
+
+class JSONEncoder(json.JSONEncoder):
+    """ JSON encoder that writes datetime values as ISO 8601 strings. """
+
+    def default(self, obj):
+        if isinstance(obj, datetime):
+            return obj.isoformat()
+        # NOTE(review): every other unsupported type falls through and
+        # is written as null - confirm that this is intended.
+
+
+class JSONSerializer(Serializer):
+    """ Collects the rows per output path and writes each group as one
+    JSON document on close(). """
+
+    def init(self):
+        self.buckets = defaultdict(list)
+
+    def write(self, path, result):
+        self.buckets[path].append(result)
+
+    def wrap(self, result):
+        """ Shape the rows of one file: only the first row in ``item``
+        mode, optionally wrapped in a count/results envelope, with the
+        configured ``meta`` attached. """
+        count = len(result)
+        if self.mode == 'item':
+            result = result[0]
+        # This method shadows the ``wrap`` property of Serializer, so
+        # ``self.wrap`` is a bound method and always true. Read the
+        # setting from the configuration instead.
+        if self.config.get_bool('wrap', default=self.mode == 'list'):
+            result = {
+                'count': count,
+                'results': result
+                }
+        meta = self.config.get('meta')
+        # An unwrapped list of rows has nowhere to attach the meta to.
+        if meta is not None and isinstance(result, dict):
+            result['meta'] = meta
+        return result
+
+    def close(self):
+        for path, result in self.buckets.items():
+            result = self.wrap(result)
+            with open(path, 'wb') as fh:
+                json.dump(result, fh,
+                          cls=JSONEncoder,
+                          indent=self.config.get_int('indent'))
diff --git a/dataset/freeze/format/ftabson.py b/dataset/freeze/format/ftabson.py
new file mode 100644
index 0000000..f49aefc
--- /dev/null
+++ b/dataset/freeze/format/ftabson.py
@@ -0,0 +1,24 @@
+from dataset.freeze.format.fjson import JSONSerializer
+
+
+class TabsonSerializer(JSONSerializer):
+    """ JSON output in a tabular layout: the column names are listed
+    once under ``fields`` and every row is a list of values. """
+
+    def wrap(self, result):
+        fields = []
+        data = []
+        if len(result):
+            # The columns of the first row define the layout.
+            keys = result[0].keys()
+            fields = [{'id': k} for k in keys]
+            data = [[row.get(k) for k in keys] for row in result]
+        result = {
+            'count': len(result),
+            'fields': fields,
+            'data': data
+            }
+        meta = self.config.get('meta')
+        if meta is not None:
+            result['meta'] = meta
+        return result
diff --git a/dataset/util.py b/dataset/util.py
new file mode 100644
index 0000000..339a5fa
--- /dev/null
+++ b/dataset/util.py
@@ -0,0 +1,58 @@
+#coding: utf-8
+import re
+from unicodedata import normalize as ucnorm, category
+
+# Use ``+``, not ``*``: a pattern that can match the empty string makes
+# re.split() cut between every character on Python 3.7 and later.
+SLUG_REMOVE = re.compile(r'[,\s\.\(\)/\\;:]+')
+
+
+class DatasetException(Exception):
+    pass
+
+
+class FreezeException(DatasetException):
+    pass
+
+
+def normalize(text):
+    """ Simplify a piece of text to generate a more canonical
+    representation. This involves lowercasing, stripping trailing
+    spaces, removing symbols, diacritical marks (umlauts) and
+    converting all newlines etc. to single spaces.
+    """
+    if not isinstance(text, unicode):
+        text = unicode(text)
+    text = text.lower()
+    decomposed = ucnorm('NFKD', text)
+    filtered = []
+    for char in decomposed:
+        cat = category(char)
+        if cat.startswith('C'):
+            filtered.append(' ')
+        elif cat.startswith('M'):
+            # marks, such as umlauts
+            continue
+        elif cat.startswith('Z'):
+            # newlines, non-breaking etc.
+            filtered.append(' ')
+        elif cat.startswith('S'):
+            # symbols, such as currency
+            continue
+        else:
+            filtered.append(char)
+    text = u''.join(filtered)
+    # Collapse runs of spaces into a single space.
+    while '  ' in text:
+        text = text.replace('  ', ' ')
+    text = text.strip()
+    return ucnorm('NFKC', text)
+
+
+def slug(text):
+    """ Create a version of a string convenient for use in a URL
+    or file name. """
+    text = normalize(text)
+    text = text.replace(u'ß', 'ss')
+    text = '-'.join(t for t in SLUG_REMOVE.split(text) if t)
+    return text.lower()
diff --git a/setup.py b/setup.py
index 34dedc2..6962a7e 100644
--- a/setup.py
+++ b/setup.py
@@ -23,9 +23,14 @@ setup(
     zip_safe=False,
     install_requires=[
         'sqlalchemy>=0.7',
-        'sqlalchemy-migrate>=0.7'
+        'sqlalchemy-migrate>=0.7',
+        "argparse >= 1.2.1",
+        "PyYAML >= 3.10"
     ],
     tests_require=[],
-    entry_points=\
-    """ """,
+    entry_points={
+        'console_scripts': [
+            'datafreeze = dataset.freeze.app:main',
+        ]
+    }
 )