diff --git a/Example.yaml b/Example.yaml deleted file mode 100644 index 2bec029..0000000 --- a/Example.yaml +++ /dev/null @@ -1,22 +0,0 @@ -common: - - database: "postgresql://user:password@localhost/operational_database" - prefix: my_project/dumps/ - format: json - -exports: - - - query: "SELECT id, title, date FROM events" - filename: "index.json" - - - query: "SELECT id, title, date, country FROM events" - filename: "countries/{{country}}.csv" - format: csv - - - query: "SELECT * FROM events" - filename: "events/{{id}}.json" - mode: item - - - query: "SELECT * FROM events" - filename: "all.json" - format: tabson diff --git a/dataset/__init__.py b/dataset/__init__.py index 6fd40c2..89e4da2 100644 --- a/dataset/__init__.py +++ b/dataset/__init__.py @@ -3,7 +3,6 @@ import warnings from dataset.persistence.database import Database from dataset.persistence.table import Table from dataset.persistence.util import row_type -from dataset.freeze.app import freeze # shut up useless SA warning: warnings.filterwarnings( diff --git a/dataset/persistence/database.py b/dataset/database.py similarity index 97% rename from dataset/persistence/database.py rename to dataset/database.py index 1e81559..88b0505 100644 --- a/dataset/persistence/database.py +++ b/dataset/database.py @@ -14,10 +14,10 @@ from sqlalchemy.engine.reflection import Inspector from alembic.migration import MigrationContext from alembic.operations import Operations -from dataset.persistence.table import Table -from dataset.persistence.util import ResultIter, row_type, safe_url, QUERY_STEP -from dataset.persistence.util import normalize_table_name -from dataset.persistence.types import Types +from dataset.table import Table +from dataset.util import ResultIter, row_type, safe_url, QUERY_STEP +from dataset.util import normalize_table_name +from dataset.types import Types log = logging.getLogger(__name__) diff --git a/dataset/freeze/__init__.py b/dataset/freeze/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/dataset/freeze/app.py b/dataset/freeze/app.py deleted file mode 100644 index 141091c..0000000 --- a/dataset/freeze/app.py +++ /dev/null @@ -1,167 +0,0 @@ -import logging -import argparse - -from sqlalchemy.exc import ProgrammingError, OperationalError -from dataset.util import FreezeException -from dataset.persistence.table import Table -from dataset.persistence.database import Database -from dataset.freeze.config import Configuration, Export -from dataset.freeze.format import get_serializer - - -log = logging.getLogger(__name__) - - -def create_parser(): - parser = argparse.ArgumentParser( - prog='datafreeze', - description='Generate static JSON and CSV extracts from a SQL database.', - epilog='For further information, please check the documentation.') - parser.add_argument('config', metavar='CONFIG', type=str, - help='freeze file cofiguration') - parser.add_argument('--db', default=None, - help='Override the freezefile database URI') - return parser - - -def freeze(result, format='csv', filename='freeze.csv', fileobj=None, - prefix='.', mode='list', **kw): - """ - Perform a data export of a given result set. This is a very - flexible exporter, allowing for various output formats, metadata - assignment, and file name templating to dump each record (or a set - of records) into individual files. 
- - :: - - result = db['person'].all() - dataset.freeze(result, format='json', filename='all-persons.json') - - Instead of passing in the file name, you can also pass a file object:: - - result = db['person'].all() - fh = open('/dev/null', 'wb') - dataset.freeze(result, format='json', fileobj=fh) - - Be aware that this will disable file name templating and store all - results to the same file. - - If ``result`` is a table (rather than a result set), all records in - the table are exported (as if ``result.all()`` had been called). - - - freeze supports two values for ``mode``: - - *list* (default) - The entire result set is dumped into a single file. - - *item* - One file is created for each row in the result set. - - You should set a ``filename`` for the exported file(s). If ``mode`` - is set to *item* the function would generate one file per row. In - that case you can use values as placeholders in filenames:: - - dataset.freeze(res, mode='item', format='json', - filename='item-{{id}}.json') - - The following output ``format`` s are supported: - - *csv* - Comma-separated values, first line contains column names. - - *json* - A JSON file containing a list of dictionaries for each row - in the table. If a ``callback`` is given, JSON with padding - (JSONP) will be generated. - - *tabson* - Tabson is a smart combination of the space-efficiency of the - CSV and the parsability and structure of JSON. - - You can pass additional named parameters specific to the used format. - - As an example, you can freeze to minified JSON with the following: - - dataset.freeze(res, format='json', indent=4, wrap=False, - filename='output.json') - - *json* and *tabson* - *callback*: - if provided, generate a JSONP string using the given callback - function, i.e. something like `callback && callback({...})` - - *indent*: - if *indent* is a non-negative integer (it is ``2`` by default - when you call `dataset.freeze`, and ``None`` via the - ``datafreeze`` command), then JSON array elements and object - members will be pretty-printed with that indent level. - An indent level of 0 will only insert newlines. - ``None`` is the most compact representation. - - *meta*: - if *meta* is not ``None`` (default: ``{}``), it will be included - in the JSON output (for *json*, only if *wrap* is ``True``). - - *wrap* (only for *json*): - if *wrap* is ``True`` (default), the JSON output is an object - of the form ``{"count": 2, "results": [...]}``. - if ``meta`` is not ``None``, a third property ``meta`` is added - to the wrapping object, with this value. 
- """ - kw.update({ - 'format': format, - 'filename': filename, - 'fileobj': fileobj, - 'prefix': prefix, - 'mode': mode - }) - - # Special cases when freezing comes from dataset.freeze - if format in ['json', 'tabson'] and 'indent' not in kw: - kw['indent'] = 2 - - records = result.all() if isinstance(result, Table) else result - return freeze_export(Export({}, kw), result=records) - - -def freeze_export(export, result=None): - try: - if result is None: - database = Database(export.get('database')) - query = database.query(export.get('query')) - else: - query = result - serializer_cls = get_serializer(export) - serializer = serializer_cls(export, query) - serializer.serialize() - except (OperationalError, ProgrammingError) as e: - raise FreezeException("Invalid query: %s" % e) - - -def freeze_with_config(config, db=None): - for export in config.exports: - if db is not None: - export.data['database'] = db - if export.skip: - log.info("Skipping: %s", export.name) - continue - log.info("Running: %s", export.name) - freeze_export(export) - - -def main(): # pragma: no cover - # Set up default logger. - logging.basicConfig(level=logging.INFO) - - try: - parser = create_parser() - args = parser.parse_args() - freeze_with_config(Configuration(args.config), args.db) - except FreezeException as fe: - log.error(fe) - - -if __name__ == '__main__': # pragma: no cover - logging.basicConfig(level=logging.DEBUG) - main() diff --git a/dataset/freeze/config.py b/dataset/freeze/config.py deleted file mode 100644 index a34199e..0000000 --- a/dataset/freeze/config.py +++ /dev/null @@ -1,88 +0,0 @@ -import json -import yaml - -from six import text_type, PY3 - -from dataset.util import FreezeException - - -TRUISH = ['true', 'yes', '1', 'on'] - -DECODER = { - 'json': json, - 'yaml': yaml - } - - -def merge_overlay(data, overlay): - out = overlay.copy() - for k, v in data.items(): - if isinstance(v, dict) and isinstance(out.get(k), dict): - v = merge_overlay(v, out.get(k)) - out[k] = v - return out - - -class Configuration(object): - - def __init__(self, file_name): - self.file_name = file_name - extension = file_name.rsplit('.', 1)[-1] - loader = DECODER.get(extension, json) - try: - if loader == json and PY3: # pragma: no cover - fh = open(file_name, encoding='utf8') - else: - fh = open(file_name, 'rb') - try: - self.data = loader.load(fh) - except ValueError as ve: - raise FreezeException("Invalid freeze file: %s" % ve) - fh.close() - except IOError as ioe: - raise FreezeException(text_type(ioe)) - - @property - def exports(self): - if not isinstance(self.data, dict): - raise FreezeException("The root element of the freeze file needs to be a hash") - if not isinstance(self.data.get('exports'), list): - raise FreezeException("The freeze file needs to have a list of exports") - common = self.data.get('common', {}) - for export in self.data.get('exports'): - yield Export(common, export) - - -class Export(object): - - def __init__(self, common, data): - self.data = merge_overlay(data, common) - - def get(self, name, default=None): - return self.data.get(name, default) - - def get_normalized(self, name, default=None): - value = self.get(name, default=default) - if value not in [None, default]: - value = text_type(value).lower().strip() - return value - - def get_bool(self, name, default=False): - value = self.get_normalized(name) - if value is None: - return default - return value in TRUISH - - def get_int(self, name, default=None): - value = self.get_normalized(name) - if value is None: - return default - return 
int(value) - - @property - def skip(self): - return self.get_bool('skip') - - @property - def name(self): - return self.get('name', self.get('query')) diff --git a/dataset/freeze/format/__init__.py b/dataset/freeze/format/__init__.py deleted file mode 100644 index 675a84e..0000000 --- a/dataset/freeze/format/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -from dataset.freeze.format.fjson import JSONSerializer -from dataset.freeze.format.fcsv import CSVSerializer -from dataset.freeze.format.ftabson import TabsonSerializer - -SERIALIZERS = { - 'json': JSONSerializer, - 'csv': CSVSerializer, - 'tabson': TabsonSerializer - } - - -def get_serializer(config): - serializer = config.get_normalized('format', 'json') - return SERIALIZERS.get(serializer) diff --git a/dataset/freeze/format/common.py b/dataset/freeze/format/common.py deleted file mode 100644 index d3108e9..0000000 --- a/dataset/freeze/format/common.py +++ /dev/null @@ -1,93 +0,0 @@ -import os -import re -import sys -import locale - -from six import binary_type, text_type -from normality import slugify - -from dataset.util import FreezeException - - -TMPL_KEY = re.compile("{{([^}]*)}}") - -OPERATIONS = { - 'identity': lambda x: x, - 'lower': lambda x: text_type(x).lower(), - 'slug': slugify - } - - -class Serializer(object): - - def __init__(self, export, query): - self._encoding = locale.getpreferredencoding() - self.export = export - self.query = query - self._paths = [] - self._get_basepath() - - if export.get('filename') == '-': - export.data['fileobj'] = sys.stdout - self.fileobj = export.get('fileobj') - - def _get_basepath(self): - prefix = self.export.get('prefix', '') - if isinstance(prefix, binary_type): - prefix = text_type(prefix, encoding=self._encoding) - prefix = os.path.abspath(prefix) - prefix = os.path.realpath(prefix) - self._prefix = prefix - filename = self.export.get('filename') - if isinstance(filename, binary_type): - filename = text_type(filename, encoding=self._encoding) - if filename is None: - raise FreezeException("No 'filename' is specified") - self._basepath = os.path.join(prefix, filename) - - def _tmpl(self, data): - def repl(m): - op, key = 'identity', m.group(1) - if ':' in key: - op, key = key.split(':', 1) - return str(OPERATIONS.get(op)(data.get(key, ''))) - path = TMPL_KEY.sub(repl, self._basepath) - return os.path.realpath(path) - - def file_name(self, row): - # signal that there is a fileobj available: - if self.fileobj is not None: - return None - - path = self._tmpl(row) - if path not in self._paths: - if not path.startswith(self._prefix): - raise FreezeException("Possible path escape detected.") - dn = os.path.dirname(path) - if not os.path.isdir(dn): - os.makedirs(dn) - self._paths.append(path) - return path - - @property - def mode(self): - mode = self.export.get_normalized('mode', 'list') - if mode not in ['list', 'item']: - raise FreezeException("Invalid mode: %s" % mode) - return mode - - @property - def wrap(self): - return self.export.get_bool('wrap', default=self.mode == 'list') - - def serialize(self): - self.init() - transforms = self.export.get('transform', {}) - for row in self.query: - - for field, operation in transforms.items(): - row[field] = OPERATIONS.get(operation)(row.get(field)) - - self.write(self.file_name(row), row) - - self.close() diff --git a/dataset/freeze/format/fcsv.py b/dataset/freeze/format/fcsv.py deleted file mode 100644 index 37ac1ea..0000000 --- a/dataset/freeze/format/fcsv.py +++ /dev/null @@ -1,50 +0,0 @@ -from __future__ import unicode_literals -import csv 
-from datetime import datetime, date - -from six import PY3, text_type - -from dataset.freeze.format.common import Serializer - - -def value_to_str(value): - if isinstance(value, (date, datetime)): - return text_type(value.isoformat()) - if not PY3 and hasattr(value, 'encode'): - return value.encode('utf-8') - if value is None: - return '' - return value - - -class CSVSerializer(Serializer): - - def init(self): - self.handles = {} - - def write(self, path, result): - keys = list(result.keys()) - if path not in self.handles: - # handle fileobj that has been passed in: - if path is not None: - if PY3: # pragma: no cover - fh = open(path, 'wt', encoding='utf8', newline='') - else: - fh = open(path, 'wb') - else: - fh = self.fileobj - - writer = csv.writer(fh) - if PY3: # pragma: no cover - writer.writerow(keys) - else: - writer.writerow([value_to_str(k) for k in keys]) - self.handles[path] = (writer, fh) - writer, fh = self.handles[path] - values = [value_to_str(result.get(k)) for k in keys] - writer.writerow(values) - - def close(self): - for writer, fh in self.handles.values(): - if fh != self.fileobj: - fh.close() diff --git a/dataset/freeze/format/fjson.py b/dataset/freeze/format/fjson.py deleted file mode 100644 index 8086ccd..0000000 --- a/dataset/freeze/format/fjson.py +++ /dev/null @@ -1,63 +0,0 @@ -import json -from datetime import datetime, date -from collections import defaultdict, OrderedDict -from decimal import Decimal - -from six import PY3 - -from dataset.freeze.format.common import Serializer - - -class JSONEncoder(json.JSONEncoder): - - def default(self, obj): - if isinstance(obj, (datetime, date)): - return obj.isoformat() - if isinstance(obj, Decimal): - return str(obj) - - -class JSONSerializer(Serializer): - - def init(self): - self.buckets = defaultdict(list) - - def write(self, path, result): - self.buckets[path].append(result) - - def wrap(self, result): - if self.mode == 'item': - result = result[0] - if self.export.get_bool('wrap', True): - result = OrderedDict([ - ('count', len(result)), - ('results', result), - ]) - meta = self.export.get('meta', {}) - if meta is not None: - result['meta'] = meta - return result - - def close(self): - for path, result in self.buckets.items(): - result = self.wrap(result) - - if self.fileobj is None: - if PY3: # pragma: no cover - fh = open(path, 'w', encoding='utf8') - else: - fh = open(path, 'wb') - else: - fh = self.fileobj - - data = json.dumps(result, - cls=JSONEncoder, - indent=self.export.get_int('indent')) - - callback = self.export.get('callback') - if callback: - data = "%s && %s(%s);" % (callback, callback, data) - - fh.write(data) - if self.fileobj is None: - fh.close() diff --git a/dataset/freeze/format/ftabson.py b/dataset/freeze/format/ftabson.py deleted file mode 100644 index 279d051..0000000 --- a/dataset/freeze/format/ftabson.py +++ /dev/null @@ -1,23 +0,0 @@ -from dataset.freeze.format.fjson import JSONSerializer - - -class TabsonSerializer(JSONSerializer): - - def wrap(self, result): - fields = [] - data = [] - if len(result): - keys = list(result[0].keys()) - fields = [{'id': k} for k in keys] - for row in result: - d = [row.get(k) for k in keys] - data.append(d) - result = { - 'count': len(data), - 'fields': fields, - 'data': data - } - meta = self.export.get('meta', {}) - if meta is not None: - result['meta'] = meta - return result diff --git a/dataset/persistence/__init__.py b/dataset/persistence/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/dataset/persistence/util.py 
b/dataset/persistence/util.py deleted file mode 100644 index 5a71d60..0000000 --- a/dataset/persistence/util.py +++ /dev/null @@ -1,102 +0,0 @@ -try: - from urlparse import urlparse -except ImportError: - from urllib.parse import urlparse - -try: - from collections import OrderedDict -except ImportError: # pragma: no cover - from ordereddict import OrderedDict - -from six import string_types -from collections import Sequence -from hashlib import sha1 - -QUERY_STEP = 1000 -row_type = OrderedDict - - -def convert_row(row_type, row): - if row is None: - return None - return row_type(row.items()) - - -def iter_result_proxy(rp, step=None): - """Iterate over the ResultProxy.""" - while True: - if step is None: - chunk = rp.fetchall() - else: - chunk = rp.fetchmany(step) - if not chunk: - break - for row in chunk: - yield row - - -class ResultIter(object): - """ SQLAlchemy ResultProxies are not iterable to get a - list of dictionaries. This is to wrap them. """ - - def __init__(self, result_proxy, row_type=row_type, step=None): - self.row_type = row_type - self.result_proxy = result_proxy - self.keys = list(result_proxy.keys()) - self._iter = iter_result_proxy(result_proxy, step=step) - - def __next__(self): - return convert_row(self.row_type, next(self._iter)) - - next = __next__ - - def __iter__(self): - return self - - def close(self): - self.result_proxy.close() - - -def normalize_column_name(name): - """Check if a string is a reasonable thing to use as a column name.""" - if not isinstance(name, string_types): - raise ValueError('%r is not a valid column name.' % name) - name = name.strip() - if not len(name) or '.' in name or '-' in name: - raise ValueError('%r is not a valid column name.' % name) - return name - - -def normalize_table_name(name): - """Check if the table name is obviously invalid.""" - if not isinstance(name, string_types): - raise ValueError("Invalid table name: %r" % name) - name = name.strip() - if not len(name): - raise ValueError("Invalid table name: %r" % name) - return name - - -def safe_url(url): - """Remove password from printed connection URLs.""" - parsed = urlparse(url) - if parsed.password is not None: - pwd = ':%s@' % parsed.password - url = url.replace(pwd, ':*****@') - return url - - -def index_name(table, columns): - """Generate an artificial index name.""" - sig = '||'.join(columns) - key = sha1(sig.encode('utf-8')).hexdigest()[:16] - return 'ix_%s_%s' % (table, key) - - -def ensure_tuple(obj): - """Try and make the given argument into a tuple.""" - if obj is None: - return tuple() - if isinstance(obj, Sequence) and not isinstance(obj, string_types): - return tuple(obj) - return obj, diff --git a/dataset/persistence/table.py b/dataset/table.py similarity index 98% rename from dataset/persistence/table.py rename to dataset/table.py index 98cf4f7..04a4704 100644 --- a/dataset/persistence/table.py +++ b/dataset/table.py @@ -9,11 +9,10 @@ from sqlalchemy import func, select, false from sqlalchemy.schema import Table as SQLATable from sqlalchemy.exc import NoSuchTableError -from dataset.persistence.types import Types -from dataset.persistence.util import normalize_column_name, index_name -from dataset.persistence.util import ensure_tuple, ResultIter, QUERY_STEP -from dataset.persistence.util import normalize_table_name -from dataset.util import DatasetException +from dataset.types import Types +from dataset.util import normalize_column_name, index_name, ensure_tuple +from dataset.util import DatasetException, ResultIter, QUERY_STEP +from dataset.util import 
normalize_table_name log = logging.getLogger(__name__) diff --git a/dataset/persistence/types.py b/dataset/types.py similarity index 100% rename from dataset/persistence/types.py rename to dataset/types.py diff --git a/dataset/util.py b/dataset/util.py index ce33267..6995dc4 100644 --- a/dataset/util.py +++ b/dataset/util.py @@ -1,12 +1,105 @@ -# coding: utf-8 -import re +try: + from urlparse import urlparse +except ImportError: + from urllib.parse import urlparse -SLUG_REMOVE = re.compile(r'[,\s\.\(\)/\\;:]*') +try: + from collections import OrderedDict +except ImportError: # pragma: no cover + from ordereddict import OrderedDict +from six import string_types +from collections import Sequence +from hashlib import sha1 + +QUERY_STEP = 1000 +row_type = OrderedDict class DatasetException(Exception): pass -class FreezeException(DatasetException): - pass +def convert_row(row_type, row): + if row is None: + return None + return row_type(row.items()) + + +def iter_result_proxy(rp, step=None): + """Iterate over the ResultProxy.""" + while True: + if step is None: + chunk = rp.fetchall() + else: + chunk = rp.fetchmany(step) + if not chunk: + break + for row in chunk: + yield row + + +class ResultIter(object): + """ SQLAlchemy ResultProxies are not iterable to get a + list of dictionaries. This is to wrap them. """ + + def __init__(self, result_proxy, row_type=row_type, step=None): + self.row_type = row_type + self.result_proxy = result_proxy + self.keys = list(result_proxy.keys()) + self._iter = iter_result_proxy(result_proxy, step=step) + + def __next__(self): + return convert_row(self.row_type, next(self._iter)) + + next = __next__ + + def __iter__(self): + return self + + def close(self): + self.result_proxy.close() + + +def normalize_column_name(name): + """Check if a string is a reasonable thing to use as a column name.""" + if not isinstance(name, string_types): + raise ValueError('%r is not a valid column name.' % name) + name = name.strip() + if not len(name) or '.' in name or '-' in name: + raise ValueError('%r is not a valid column name.' % name) + return name + + +def normalize_table_name(name): + """Check if the table name is obviously invalid.""" + if not isinstance(name, string_types): + raise ValueError("Invalid table name: %r" % name) + name = name.strip() + if not len(name): + raise ValueError("Invalid table name: %r" % name) + return name + + +def safe_url(url): + """Remove password from printed connection URLs.""" + parsed = urlparse(url) + if parsed.password is not None: + pwd = ':%s@' % parsed.password + url = url.replace(pwd, ':*****@') + return url + + +def index_name(table, columns): + """Generate an artificial index name.""" + sig = '||'.join(columns) + key = sha1(sig.encode('utf-8')).hexdigest()[:16] + return 'ix_%s_%s' % (table, key) + + +def ensure_tuple(obj): + """Try and make the given argument into a tuple.""" + if obj is None: + return tuple() + if isinstance(obj, Sequence) and not isinstance(obj, string_types): + return tuple(obj) + return obj, diff --git a/docs/conf.py b/docs/conf.py index 020b702..38b6972 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -48,9 +48,9 @@ copyright = u'2013-2015, Friedrich Lindenberg, Gregor Aisch, Stefan Wehrmeyer' # built documents. # # The short X.Y version. -version = '0.6' +version = '1.0' # The full version, including alpha/beta/rc tags. -release = '0.6.0' +release = '1.0.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. 
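The renames above flatten the ``dataset.persistence`` package into top-level modules, so downstream code only needs new import paths. A minimal sketch of the relocated helpers (behavior as defined in ``dataset/util.py`` in this patch; the connection string is illustrative)::

    # old layout (pre-1.0):
    #   from dataset.persistence.util import safe_url
    # new layout after this patch:
    from dataset.util import safe_url, index_name, ensure_tuple

    # mask credentials before logging a connection string
    print(safe_url('postgresql://user:secret@localhost/db'))
    # -> postgresql://user:*****@localhost/db

    # deterministic artificial index name: 'ix_<table>_<sha1 prefix>'
    print(index_name('user', ['country', 'city']))

    # normalize scalars and sequences to tuples
    assert ensure_tuple('name') == ('name',)
    assert ensure_tuple(['a', 'b']) == ('a', 'b')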
diff --git a/docs/freezefile.rst b/docs/freezefile.rst deleted file mode 100644 index 9561f71..0000000 --- a/docs/freezefile.rst +++ /dev/null @@ -1,98 +0,0 @@ - -Freezefiles and the ``datafreeze`` command -========================================== - -``datafreeze`` creates static extracts of SQL databases for use in interactive -web applications. SQL databases are a great way to manage relational data, but -exposing them on the web to drive data apps can be cumbersome. Often, the -capacities of a proper database are not actually required, a few static JSON -files and a bit of JavaScript can have the same effect. Still, exporting JSON -by hand (or with a custom script) can also become a messy process. - -With ``datafreeze``, exports are scripted in a Makefile-like description, making them simple to repeat and replicate. - - -Basic Usage ------------ - -Calling DataFreeze is simple, the application is called with a -freeze file as its argument: - -.. code-block:: bash - - datafreeze Freezefile.yaml - -Freeze files can be either written in JSON or in YAML. The database URI -indicated in the Freezefile can also be overridden via the command line: - - datafreeze --db sqlite:///foo.db Freezefile.yaml - - -Example Freezefile.yaml ------------------------ - -A freeze file is composed of a set of scripted queries and -specifications on how their output is to be handled. An example could look -like this: - -.. code-block:: yaml - - common: - - database: "postgresql://user:password@localhost/operational_database" - prefix: my_project/dumps/ - format: json - - exports: - - - query: "SELECT id, title, date FROM events" - filename: "index.json" - - - query: "SELECT id, title, date, country FROM events" - filename: "countries/{{country}}.csv" - format: csv - - - query: "SELECT * FROM events" - filename: "events/{{id}}.json" - mode: item - - - query: "SELECT * FROM events" - filename: "all.json" - format: tabson - -An identical JSON configuration can be found in this repository. - - -Options in detail ------------------ - -The freeze file has two main sections, ``common`` and ``exports``. Both -accept many of the same arguments, with ``exports`` specifying a list of -exports while ``common`` defines some shared properties, such as the -database connection string. - -The following options are recognized: - -* ``database`` is a database URI, including the database type, username - and password, hostname and database name. Valid database types include - ``sqlite``, ``mysql`` and ``postgresql`` (requires psycopg2). -* ``prefix`` specifies a common root directory for all extracted files. -* ``format`` identifies the format to be generated, ``csv``, ``json`` and - ``tabson`` are supported. ``tabson`` is a condensed JSON - representation in which rows are not represented by objects but by - lists of values. -* ``query`` needs to be a valid SQL statement. All selected fields will - become keys or columns in the output, so it may make sense to define - proper aliases if any overlap is to be expected. -* ``mode`` specifies whether the query output is to be combined into a - single file (``list``) or whether a file should be generated for each - result row (``item``). -* ``filename`` is the output file name, appended to ``prefix``. All - occurences of ``{{field}}`` are expanded to a fields value to allow the - generation of file names e.g. by primary key. In list mode, templating - can be used to group records into several buckets, e.g. by country or - category. 
-* ``wrap`` can be used to specify whether the output should be wrapped
-  in a ``results`` hash in JSON output. This defaults to ``true`` for
-  ``list``-mode output and ``false`` for ``item``-mode.
-
diff --git a/docs/index.rst b/docs/index.rst
index fd82e00..c299112 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -10,22 +10,19 @@ dataset: databases for lazy people
    :hidden:

-Although managing data in relational database has plenty of benefits, they're rarely used in day-to-day work with small to medium scale datasets. But why is that? Why do we see an awful lot of data stored in static files in CSV or JSON format, even though they are hard
-to query and update incrementally?
+Although managing data in relational databases has plenty of benefits, they're
+rarely used in day-to-day work with small to medium scale datasets. But why is
+that? Why do we see an awful lot of data stored in static files in CSV or JSON
+format, even though they are hard to query and update incrementally?

-The answer is that **programmers are lazy**, and thus they tend to prefer the easiest solution they find. And in **Python**, a database isn't the simplest solution for storing a bunch of structured data. This is what **dataset** is going to change!
+The answer is that **programmers are lazy**, and thus they tend to prefer the
+easiest solution they find. And in **Python**, a database isn't the simplest
+solution for storing a bunch of structured data. This is what **dataset** is
+going to change!

-**dataset** provides two key functions that make using SQL databases in
-Python a breeze:
-
-* A simple abstraction layer removes most direct SQL statements without
-  the necessity for a full ORM model - essentially, databases can be
-  used like a JSON file or NoSQL store.
-
-* Database contents can be exported (*frozen*) using a :doc:`sophisticated
-  plain file generator ` with JSON and CSV support. Exports can be configured
-  to include metadata and dynamic file names depending on the exported
-  data. The exporter can also be used as a command-line tool, ``datafreeze``.
+**dataset** provides a simple abstraction layer that removes most direct SQL
+statements without the necessity for a full ORM model - essentially, databases
+can be used like a JSON file or NoSQL store.

 A simple data loading script using **dataset** might look like this:

@@ -55,8 +52,6 @@ Features
 * **Query helpers** for simple queries such as :py:meth:`all ` rows in a table or
   all :py:meth:`distinct ` values across a set of columns.
 * **Compatibility**: Being built on top of `SQLAlchemy `_, ``dataset`` works with all major databases, such as SQLite, PostgreSQL and MySQL.
-* **Scripted exports**: Data can be exported based on a scripted
-  configuration, making the process easy and replicable.

 Contents
 --------
@@ -66,12 +61,14 @@ Contents

    install
    quickstart
-   freezefile
    api

 Contributors
 ------------

-``dataset`` is written and maintained by `Friedrich Lindenberg `_, `Gregor Aisch `_ and `Stefan Wehrmeyer `_. Its code is largely based on the preceding libraries `sqlaload `_ and datafreeze. And of course, we're standing on the `shoulders of giants `_.
+``dataset`` is written and maintained by `Friedrich Lindenberg `_,
+`Gregor Aisch `_ and `Stefan Wehrmeyer `_.
+Its code is largely based on the preceding libraries `sqlaload `_
+and datafreeze. And of course, we're standing on the `shoulders of giants `_.

 Our cute little `naked mole rat `_ was drawn by `Johannes Koch `_.
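The removal above drops ``dataset.freeze`` and the ``datafreeze`` CLI without a direct replacement. For simple cases the standard library covers the same ground — a rough sketch, not part of this patch; it mirrors the ``count``/``results`` wrapping of the deleted JSON serializer, and the database and table names are hypothetical::

    import csv
    import json

    import dataset

    db = dataset.connect('sqlite:///mydatabase.db')
    rows = [dict(row) for row in db['user'].all()]

    # JSON, roughly freeze(result, format='json', filename='users.json')
    with open('users.json', 'w') as fh:
        json.dump({'count': len(rows), 'results': rows}, fh,
                  indent=2, default=str)  # default=str covers dates/Decimals

    # CSV, roughly freeze(result, format='csv', filename='users.csv')
    fields = sorted({key for row in rows for key in row})
    with open('users.csv', 'w', newline='') as fh:
        writer = csv.DictWriter(fh, fieldnames=fields)
        writer.writeheader()
        writer.writerows(rows)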
diff --git a/docs/install.rst b/docs/install.rst
index b1d88de..00093a9 100644
--- a/docs/install.rst
+++ b/docs/install.rst
@@ -2,7 +2,8 @@
 Installation Guide
 ==================

-The easiest way is to install ``dataset`` from the `Python Package Index `_ using ``pip`` or ``easy_install``:
+The easiest way is to install ``dataset`` from the `Python Package Index
+`_ using ``pip`` or ``easy_install``:

 .. code-block:: bash

@@ -16,4 +17,6 @@ To install it manually simply download the repository from Github:
    $ cd dataset/
    $ python setup.py install

-Depending on the type of database backend, you may also need to install a database specific driver package. For MySQL, this is ``MySQLdb``, for Postgres its ``psycopg2``. SQLite support is integrated into Python.
+Depending on the type of database backend, you may also need to install a
+database-specific driver package. For MySQL, this is ``MySQLdb``; for Postgres
+it's ``psycopg2``. SQLite support is integrated into Python.
diff --git a/docs/quickstart.rst b/docs/quickstart.rst
index d6a9370..72114cf 100644
--- a/docs/quickstart.rst
+++ b/docs/quickstart.rst
@@ -30,8 +30,8 @@ so you can initialize database connection without explicitly passing an `URL`::

 Depending on which database you're using, you may also have to
 install the database bindings to support that database. SQLite is included in
-the Python core, but PostgreSQL requires ``psycopg2`` to be installed.
-MySQL can be enabled by installing the ``mysql-db`` drivers.
+the Python core, but PostgreSQL requires ``psycopg2`` to be installed.
+MySQL can be enabled by installing the ``mysql-db`` drivers.

 Storing data
@@ -110,7 +110,7 @@ database:
 Now, let's list all columns available in the table ``user``:

    >>> print(db['user'].columns)
-   [u'id', u'country', u'age', u'name', u'gender']
+   [u'id', u'country', u'age', u'name', u'gender']

 Using ``len()`` we can get the total number of rows in a table:

@@ -156,7 +156,7 @@ results will be returned::

     db = dataset.connect('sqlite:///mydatabase.db', row_type=stuf)

 Now contents will be returned in ``stuf`` objects (basically, ``dict``
-objects whose elements can be acessed as attributes (``item.name``) as well as
+objects whose elements can be accessed as attributes (``item.name``) as well as
 by index (``item['name']``).

 Running custom SQL queries
@@ -169,36 +169,10 @@ use the full power of SQL queries. Here's how you run them with ``dataset``::

     for row in result:
         print(row['country'], row['c'])

-The :py:meth:`query() ` method can also be used to
+The :py:meth:`query() ` method can also be used to
 access the underlying `SQLAlchemy core API `_, which allows for the programmatic
 construction of more complex queries::

     table = db['user'].table
     statement = table.select(table.c.name.like('%John%'))
-    result = db.query(statement)
-
-
-Exporting data
---------------
-
-While playing around with our database in Python is a nice thing, they are
-sometimes just a processing stage until we go on to use it in another
-place, say in an interactive web application.
To make this seamless, -``dataset`` supports serializing rows of data into static JSON and CSV files -such using the :py:meth:`freeze() ` function:: - - # export all users into a single JSON - result = db['users'].all() - dataset.freeze(result, format='json', filename='users.json') - -You can create one file per row by setting ``mode`` to "item":: - - # export one JSON file per user - dataset.freeze(result, format='json', filename='users/{{ id }}.json', mode='item') - -Since this is a common operation we made it available via command line -utility ``datafreeze``. Read more about the :doc:`freezefile markup `. - -.. code-block:: bash - - $ datafreeze freezefile.yaml + result = db.query(statement) diff --git a/setup.py b/setup.py index 1e1dee0..ddb8705 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ if sys.version_info[:2] <= (2, 6): setup( name='dataset', - version='0.8.0', + version='1.0.0', description="Toolkit for Python-based data processing.", long_description="", classifiers=[ @@ -34,14 +34,9 @@ setup( 'sqlalchemy >= 1.1.0', 'alembic >= 0.6.2', 'normality >= 0.3.9', - "PyYAML >= 3.10", "six >= 1.7.3" ] + py26_dependency, tests_require=[], test_suite='test', - entry_points={ - 'console_scripts': [ - 'datafreeze = dataset.freeze.app:main', - ] - } + entry_points={} ) diff --git a/test/Freezefile.yaml b/test/Freezefile.yaml deleted file mode 100644 index 2c36861..0000000 --- a/test/Freezefile.yaml +++ /dev/null @@ -1,32 +0,0 @@ -common: - - database: "postgresql://user:password@localhost/operational_database" - prefix: my_project/dumps/ - format: json - nested: - - property: "inner" - -exports: - - - query: "SELECT id, title, date FROM events" - filename: "index.json" - number: 5 - bool: true - nested: - - property: "override" - - - query: "SELECT id, title, date, country FROM events" - filename: "countries/{{country}}.csv" - format: csv - - - query: "SELECT * FROM events" - filename: "events/{{id}}.json" - mode: item - wrap: true - - - query: "SELECT * FROM events" - filename: "all.json" - format: tabson - diff --git a/test/test_freeze_app.py b/test/test_freeze_app.py deleted file mode 100644 index f9bee0d..0000000 --- a/test/test_freeze_app.py +++ /dev/null @@ -1,67 +0,0 @@ -# coding: utf-8 -""" -Test CLI following the recipe at http://dustinrcollins.com/testing-python-command-line-apps -""" -import os -import unittest -from tempfile import mkdtemp -from shutil import rmtree -from copy import copy - -from six import StringIO - -from dataset import connect -from dataset.util import FreezeException -from dataset.freeze.config import Configuration, Export -from dataset.freeze.app import create_parser, freeze_with_config, freeze_export -from .sample_data import TEST_DATA - - -class FreezeAppTestCase(unittest.TestCase): - """ - Base TestCase class, sets up a CLI parser - """ - def setUp(self): - parser = create_parser() - self.parser = parser - self.d = mkdtemp() - self.db_path = os.path.abspath(os.path.join(self.d, 'db.sqlite')) - self.db = 'sqlite:///' + self.db_path - _db = connect(self.db) - tbl = _db['weather'] - for i, row in enumerate(TEST_DATA): - _row = copy(row) - _row['count'] = i - _row['bool'] = True - _row['none'] = None - tbl.insert(_row) - - def tearDown(self): - rmtree(self.d, ignore_errors=True) - - def test_with_config(self): - cfg = Configuration(os.path.join(os.path.dirname(__file__), 'Freezefile.yaml')) - cfg.data['common']['database'] = self.db - cfg.data['common']['prefix'] = self.d - cfg.data['common']['query'] = 'SELECT * FROM weather' - cfg.data['exports'] = 
[ - {'filename': '{{identity:count}}.json', 'mode': 'item', 'transform': {'bool': 'identity'}}, - {'filename': 'weather.json', 'format': 'tabson'}, - {'filename': 'weather.csv', 'fileobj': StringIO(), 'format': 'csv'}, - {'filename': 'weather.json', 'fileobj': StringIO(), 'format': 'tabson'}, - {'filename': 'weather.json', 'format': 'tabson', 'callback': 'read'}, - {'skip': True}] - freeze_with_config(cfg, db=self.db) - self.assertRaises(FreezeException, freeze_export, Export(cfg.data['common'], {'query': 'SELECT * FROM notable'})) - - def test_unicode_path(self): - cfg = Configuration(os.path.join(os.path.dirname(__file__), 'Freezefile.yaml')) - cfg.data['common']['database'] = self.db - cfg.data['common']['prefix'] = os.path.join(self.d, u'über') - cfg.data['common']['query'] = 'SELECT * FROM weather' - cfg.data['exports'] = [{'filename': 'weather.csv', 'format': 'csv'}] - freeze_with_config(cfg, db=self.db) - - -if __name__ == '__main__': - unittest.main()
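With the freeze machinery and its tests gone, what remains is the core table API. A quick smoke test in the spirit of the deleted test case above (in-memory SQLite; the ``weather`` row is illustrative)::

    import dataset

    db = dataset.connect('sqlite:///:memory:')
    tbl = db['weather']
    tbl.insert({'place': 'Berlin', 'temp_c': 9})

    assert 'weather' in db.tables
    assert len(tbl) == 1  # len() is documented in the quickstart
    assert list(tbl.all())[0]['place'] == 'Berlin'

    # the freeze module is removed by this patch, so this import must fail
    try:
        from dataset.freeze.app import freeze
    except ImportError:
        pass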