Remove datafreeze component, fixes #217

Friedrich Lindenberg 2017-09-09 18:24:34 +02:00
parent cd091eadca
commit a049691749
24 changed files with 136 additions and 895 deletions

View File

@@ -1,22 +0,0 @@
common:
database: "postgresql://user:password@localhost/operational_database"
prefix: my_project/dumps/
format: json
exports:
- query: "SELECT id, title, date FROM events"
filename: "index.json"
- query: "SELECT id, title, date, country FROM events"
filename: "countries/{{country}}.csv"
format: csv
- query: "SELECT * FROM events"
filename: "events/{{id}}.json"
mode: item
- query: "SELECT * FROM events"
filename: "all.json"
format: tabson

View File

@@ -3,7 +3,6 @@ import warnings
from dataset.persistence.database import Database
from dataset.persistence.table import Table
from dataset.persistence.util import row_type
-from dataset.freeze.app import freeze
# shut up useless SA warning:
warnings.filterwarnings(

View File

@@ -14,10 +14,10 @@ from sqlalchemy.engine.reflection import Inspector
from alembic.migration import MigrationContext
from alembic.operations import Operations
-from dataset.persistence.table import Table
+from dataset.table import Table
-from dataset.persistence.util import ResultIter, row_type, safe_url, QUERY_STEP
+from dataset.util import ResultIter, row_type, safe_url, QUERY_STEP
-from dataset.persistence.util import normalize_table_name
+from dataset.util import normalize_table_name
-from dataset.persistence.types import Types
+from dataset.types import Types
log = logging.getLogger(__name__)
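Note for downstream code: this change set flattens the old ``dataset.persistence`` package into top-level modules. A minimal before/after sketch for code that imported these internals directly (paths taken from the hunks above):

# before this commit (dataset 0.8.x)
# from dataset.persistence.table import Table
# from dataset.persistence.util import row_type
# after this commit (dataset 1.0.0)
from dataset.table import Table
from dataset.util import row_type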

View File

@@ -1,167 +0,0 @@
import logging
import argparse
from sqlalchemy.exc import ProgrammingError, OperationalError
from dataset.util import FreezeException
from dataset.persistence.table import Table
from dataset.persistence.database import Database
from dataset.freeze.config import Configuration, Export
from dataset.freeze.format import get_serializer
log = logging.getLogger(__name__)
def create_parser():
parser = argparse.ArgumentParser(
prog='datafreeze',
description='Generate static JSON and CSV extracts from a SQL database.',
epilog='For further information, please check the documentation.')
parser.add_argument('config', metavar='CONFIG', type=str,
help='freeze file configuration')
parser.add_argument('--db', default=None,
help='Override the freezefile database URI')
return parser
def freeze(result, format='csv', filename='freeze.csv', fileobj=None,
prefix='.', mode='list', **kw):
"""
Perform a data export of a given result set. This is a very
flexible exporter, allowing for various output formats, metadata
assignment, and file name templating to dump each record (or a set
of records) into individual files.
::
result = db['person'].all()
dataset.freeze(result, format='json', filename='all-persons.json')
Instead of passing in the file name, you can also pass a file object::
result = db['person'].all()
fh = open('/dev/null', 'wb')
dataset.freeze(result, format='json', fileobj=fh)
Be aware that this will disable file name templating and store all
results to the same file.
If ``result`` is a table (rather than a result set), all records in
the table are exported (as if ``result.all()`` had been called).
freeze supports two values for ``mode``:
*list* (default)
The entire result set is dumped into a single file.
*item*
One file is created for each row in the result set.
You should set a ``filename`` for the exported file(s). If ``mode``
is set to *item* the function would generate one file per row. In
that case you can use values as placeholders in filenames::
dataset.freeze(res, mode='item', format='json',
filename='item-{{id}}.json')
The following output ``format`` s are supported:
*csv*
Comma-separated values, first line contains column names.
*json*
A JSON file containing a list of dictionaries for each row
in the table. If a ``callback`` is given, JSON with padding
(JSONP) will be generated.
*tabson*
Tabson is a smart combination of the space-efficiency of CSV
and the parsability and structure of JSON.
You can pass additional named parameters specific to the used format.
As an example, you can tune the JSON output with the following:
dataset.freeze(res, format='json', indent=4, wrap=False,
filename='output.json')
*json* and *tabson*
*callback*:
if provided, generate a JSONP string using the given callback
function, i.e. something like `callback && callback({...})`
*indent*:
if *indent* is a non-negative integer (it is ``2`` by default
when you call `dataset.freeze`, and ``None`` via the
``datafreeze`` command), then JSON array elements and object
members will be pretty-printed with that indent level.
An indent level of 0 will only insert newlines.
``None`` is the most compact representation.
*meta*:
if *meta* is not ``None`` (default: ``{}``), it will be included
in the JSON output (for *json*, only if *wrap* is ``True``).
*wrap* (only for *json*):
if *wrap* is ``True`` (default), the JSON output is an object
of the form ``{"count": 2, "results": [...]}``.
if ``meta`` is not ``None``, a third property ``meta`` is added
to the wrapping object, with this value.
"""
kw.update({
'format': format,
'filename': filename,
'fileobj': fileobj,
'prefix': prefix,
'mode': mode
})
# Special cases when freezing comes from dataset.freeze
if format in ['json', 'tabson'] and 'indent' not in kw:
kw['indent'] = 2
records = result.all() if isinstance(result, Table) else result
return freeze_export(Export({}, kw), result=records)
def freeze_export(export, result=None):
try:
if result is None:
database = Database(export.get('database'))
query = database.query(export.get('query'))
else:
query = result
serializer_cls = get_serializer(export)
serializer = serializer_cls(export, query)
serializer.serialize()
except (OperationalError, ProgrammingError) as e:
raise FreezeException("Invalid query: %s" % e)
def freeze_with_config(config, db=None):
for export in config.exports:
if db is not None:
export.data['database'] = db
if export.skip:
log.info("Skipping: %s", export.name)
continue
log.info("Running: %s", export.name)
freeze_export(export)
def main(): # pragma: no cover
# Set up default logger.
logging.basicConfig(level=logging.INFO)
try:
parser = create_parser()
args = parser.parse_args()
freeze_with_config(Configuration(args.config), args.db)
except FreezeException as fe:
log.error(fe)
if __name__ == '__main__': # pragma: no cover
logging.basicConfig(level=logging.DEBUG)
main()
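For reference, a minimal sketch of how the removed module was driven programmatically rather than via the CLI. These imports only exist before this commit (dataset 0.8.x and earlier), and the freeze file is assumed to look like the example at the top of this diff:

# pre-removal usage sketch; dataset.freeze.* is deleted by this commit
from dataset.freeze.config import Configuration
from dataset.freeze.app import freeze_with_config

config = Configuration('Freezefile.yaml')           # YAML or JSON freeze file
freeze_with_config(config, db='sqlite:///app.db')   # optional --db style override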

View File

@@ -1,88 +0,0 @@
import json
import yaml
from six import text_type, PY3
from dataset.util import FreezeException
TRUISH = ['true', 'yes', '1', 'on']
DECODER = {
'json': json,
'yaml': yaml
}
def merge_overlay(data, overlay):
out = overlay.copy()
for k, v in data.items():
if isinstance(v, dict) and isinstance(out.get(k), dict):
v = merge_overlay(v, out.get(k))
out[k] = v
return out
class Configuration(object):
def __init__(self, file_name):
self.file_name = file_name
extension = file_name.rsplit('.', 1)[-1]
loader = DECODER.get(extension, json)
try:
if loader == json and PY3: # pragma: no cover
fh = open(file_name, encoding='utf8')
else:
fh = open(file_name, 'rb')
try:
self.data = loader.load(fh)
except ValueError as ve:
raise FreezeException("Invalid freeze file: %s" % ve)
fh.close()
except IOError as ioe:
raise FreezeException(text_type(ioe))
@property
def exports(self):
if not isinstance(self.data, dict):
raise FreezeException("The root element of the freeze file needs to be a hash")
if not isinstance(self.data.get('exports'), list):
raise FreezeException("The freeze file needs to have a list of exports")
common = self.data.get('common', {})
for export in self.data.get('exports'):
yield Export(common, export)
class Export(object):
def __init__(self, common, data):
self.data = merge_overlay(data, common)
def get(self, name, default=None):
return self.data.get(name, default)
def get_normalized(self, name, default=None):
value = self.get(name, default=default)
if value not in [None, default]:
value = text_type(value).lower().strip()
return value
def get_bool(self, name, default=False):
value = self.get_normalized(name)
if value is None:
return default
return value in TRUISH
def get_int(self, name, default=None):
value = self.get_normalized(name)
if value is None:
return default
return int(value)
@property
def skip(self):
return self.get_bool('skip')
@property
def name(self):
return self.get('name', self.get('query'))
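``merge_overlay`` is what gives every export the ``common`` block as defaults, recursing into nested dictionaries. A small behaviour sketch with illustrative values:

common = {'format': 'json', 'prefix': 'dumps/', 'nested': {'property': 'inner'}}
export = {'format': 'csv', 'nested': {'property': 'override'}}
merged = merge_overlay(export, common)
# merged == {'format': 'csv', 'prefix': 'dumps/', 'nested': {'property': 'override'}}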

View File

@@ -1,14 +0,0 @@
from dataset.freeze.format.fjson import JSONSerializer
from dataset.freeze.format.fcsv import CSVSerializer
from dataset.freeze.format.ftabson import TabsonSerializer
SERIALIZERS = {
'json': JSONSerializer,
'csv': CSVSerializer,
'tabson': TabsonSerializer
}
def get_serializer(config):
serializer = config.get_normalized('format', 'json')
return SERIALIZERS.get(serializer)

View File

@@ -1,93 +0,0 @@
import os
import re
import sys
import locale
from six import binary_type, text_type
from normality import slugify
from dataset.util import FreezeException
TMPL_KEY = re.compile("{{([^}]*)}}")
OPERATIONS = {
'identity': lambda x: x,
'lower': lambda x: text_type(x).lower(),
'slug': slugify
}
class Serializer(object):
def __init__(self, export, query):
self._encoding = locale.getpreferredencoding()
self.export = export
self.query = query
self._paths = []
self._get_basepath()
if export.get('filename') == '-':
export.data['fileobj'] = sys.stdout
self.fileobj = export.get('fileobj')
def _get_basepath(self):
prefix = self.export.get('prefix', '')
if isinstance(prefix, binary_type):
prefix = text_type(prefix, encoding=self._encoding)
prefix = os.path.abspath(prefix)
prefix = os.path.realpath(prefix)
self._prefix = prefix
filename = self.export.get('filename')
if isinstance(filename, binary_type):
filename = text_type(filename, encoding=self._encoding)
if filename is None:
raise FreezeException("No 'filename' is specified")
self._basepath = os.path.join(prefix, filename)
def _tmpl(self, data):
def repl(m):
op, key = 'identity', m.group(1)
if ':' in key:
op, key = key.split(':', 1)
return str(OPERATIONS.get(op)(data.get(key, '')))
path = TMPL_KEY.sub(repl, self._basepath)
return os.path.realpath(path)
def file_name(self, row):
# signal that there is a fileobj available:
if self.fileobj is not None:
return None
path = self._tmpl(row)
if path not in self._paths:
if not path.startswith(self._prefix):
raise FreezeException("Possible path escape detected.")
dn = os.path.dirname(path)
if not os.path.isdir(dn):
os.makedirs(dn)
self._paths.append(path)
return path
@property
def mode(self):
mode = self.export.get_normalized('mode', 'list')
if mode not in ['list', 'item']:
raise FreezeException("Invalid mode: %s" % mode)
return mode
@property
def wrap(self):
return self.export.get_bool('wrap', default=self.mode == 'list')
def serialize(self):
self.init()
transforms = self.export.get('transform', {})
for row in self.query:
for field, operation in transforms.items():
row[field] = OPERATIONS.get(operation)(row.get(field))
self.write(self.file_name(row), row)
self.close()
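The ``{{field}}`` and ``{{operation:field}}`` placeholders in ``filename`` are expanded per row by ``_tmpl`` above. A standalone sketch of the same substitution, with illustrative values:

import re
from normality import slugify

TMPL_KEY = re.compile("{{([^}]*)}}")
OPERATIONS = {'identity': lambda x: x, 'lower': lambda x: str(x).lower(), 'slug': slugify}

def expand(template, row):
    """Expand {{field}} / {{op:field}} placeholders with values from a row."""
    def repl(match):
        op, key = 'identity', match.group(1)
        if ':' in key:
            op, key = key.split(':', 1)
        return str(OPERATIONS.get(op)(row.get(key, '')))
    return TMPL_KEY.sub(repl, template)

# expand('countries/{{slug:country}}.csv', {'country': 'New Zealand'})
# -> 'countries/new-zealand.csv'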

View File

@@ -1,50 +0,0 @@
from __future__ import unicode_literals
import csv
from datetime import datetime, date
from six import PY3, text_type
from dataset.freeze.format.common import Serializer
def value_to_str(value):
if isinstance(value, (date, datetime)):
return text_type(value.isoformat())
if not PY3 and hasattr(value, 'encode'):
return value.encode('utf-8')
if value is None:
return ''
return value
class CSVSerializer(Serializer):
def init(self):
self.handles = {}
def write(self, path, result):
keys = list(result.keys())
if path not in self.handles:
# handle fileobj that has been passed in:
if path is not None:
if PY3: # pragma: no cover
fh = open(path, 'wt', encoding='utf8', newline='')
else:
fh = open(path, 'wb')
else:
fh = self.fileobj
writer = csv.writer(fh)
if PY3: # pragma: no cover
writer.writerow(keys)
else:
writer.writerow([value_to_str(k) for k in keys])
self.handles[path] = (writer, fh)
writer, fh = self.handles[path]
values = [value_to_str(result.get(k)) for k in keys]
writer.writerow(values)
def close(self):
for writer, fh in self.handles.values():
if fh != self.fileobj:
fh.close()

View File

@@ -1,63 +0,0 @@
import json
from datetime import datetime, date
from collections import defaultdict, OrderedDict
from decimal import Decimal
from six import PY3
from dataset.freeze.format.common import Serializer
class JSONEncoder(json.JSONEncoder):
def default(self, obj):
if isinstance(obj, (datetime, date)):
return obj.isoformat()
if isinstance(obj, Decimal):
return str(obj)
class JSONSerializer(Serializer):
def init(self):
self.buckets = defaultdict(list)
def write(self, path, result):
self.buckets[path].append(result)
def wrap(self, result):
if self.mode == 'item':
result = result[0]
if self.export.get_bool('wrap', True):
result = OrderedDict([
('count', len(result)),
('results', result),
])
meta = self.export.get('meta', {})
if meta is not None:
result['meta'] = meta
return result
def close(self):
for path, result in self.buckets.items():
result = self.wrap(result)
if self.fileobj is None:
if PY3: # pragma: no cover
fh = open(path, 'w', encoding='utf8')
else:
fh = open(path, 'wb')
else:
fh = self.fileobj
data = json.dumps(result,
cls=JSONEncoder,
indent=self.export.get_int('indent'))
callback = self.export.get('callback')
if callback:
data = "%s && %s(%s);" % (callback, callback, data)
fh.write(data)
if self.fileobj is None:
fh.close()
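In list mode with ``wrap`` enabled (the default), the serializer above writes a wrapping object rather than a bare array. A shape sketch with made-up values:

wrapped = {
    'count': 2,
    'results': [
        {'id': 1, 'title': 'Launch', 'date': '2017-09-09'},
        {'id': 2, 'title': 'Follow-up', 'date': '2017-09-10'},
    ],
    'meta': {},   # whatever the export's `meta` option contains
}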

View File

@@ -1,23 +0,0 @@
from dataset.freeze.format.fjson import JSONSerializer
class TabsonSerializer(JSONSerializer):
def wrap(self, result):
fields = []
data = []
if len(result):
keys = list(result[0].keys())
fields = [{'id': k} for k in keys]
for row in result:
d = [row.get(k) for k in keys]
data.append(d)
result = {
'count': len(data),
'fields': fields,
'data': data
}
meta = self.export.get('meta', {})
if meta is not None:
result['meta'] = meta
return result
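``tabson`` stores the field names once and each row as a positional list. For the same two rows as in the JSON sketch above, the wrapped output would look roughly like this (values illustrative):

tabson = {
    'count': 2,
    'fields': [{'id': 'id'}, {'id': 'title'}, {'id': 'date'}],
    'data': [
        [1, 'Launch', '2017-09-09'],
        [2, 'Follow-up', '2017-09-10'],
    ],
    'meta': {},
}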

View File

@@ -1,102 +0,0 @@
try:
from urlparse import urlparse
except ImportError:
from urllib.parse import urlparse
try:
from collections import OrderedDict
except ImportError: # pragma: no cover
from ordereddict import OrderedDict
from six import string_types
from collections import Sequence
from hashlib import sha1
QUERY_STEP = 1000
row_type = OrderedDict
def convert_row(row_type, row):
if row is None:
return None
return row_type(row.items())
def iter_result_proxy(rp, step=None):
"""Iterate over the ResultProxy."""
while True:
if step is None:
chunk = rp.fetchall()
else:
chunk = rp.fetchmany(step)
if not chunk:
break
for row in chunk:
yield row
class ResultIter(object):
""" SQLAlchemy ResultProxies are not iterable to get a
list of dictionaries. This is to wrap them. """
def __init__(self, result_proxy, row_type=row_type, step=None):
self.row_type = row_type
self.result_proxy = result_proxy
self.keys = list(result_proxy.keys())
self._iter = iter_result_proxy(result_proxy, step=step)
def __next__(self):
return convert_row(self.row_type, next(self._iter))
next = __next__
def __iter__(self):
return self
def close(self):
self.result_proxy.close()
def normalize_column_name(name):
"""Check if a string is a reasonable thing to use as a column name."""
if not isinstance(name, string_types):
raise ValueError('%r is not a valid column name.' % name)
name = name.strip()
if not len(name) or '.' in name or '-' in name:
raise ValueError('%r is not a valid column name.' % name)
return name
def normalize_table_name(name):
"""Check if the table name is obviously invalid."""
if not isinstance(name, string_types):
raise ValueError("Invalid table name: %r" % name)
name = name.strip()
if not len(name):
raise ValueError("Invalid table name: %r" % name)
return name
def safe_url(url):
"""Remove password from printed connection URLs."""
parsed = urlparse(url)
if parsed.password is not None:
pwd = ':%s@' % parsed.password
url = url.replace(pwd, ':*****@')
return url
def index_name(table, columns):
"""Generate an artificial index name."""
sig = '||'.join(columns)
key = sha1(sig.encode('utf-8')).hexdigest()[:16]
return 'ix_%s_%s' % (table, key)
def ensure_tuple(obj):
"""Try and make the given argument into a tuple."""
if obj is None:
return tuple()
if isinstance(obj, Sequence) and not isinstance(obj, string_types):
return tuple(obj)
return obj,

View File

@@ -9,11 +9,10 @@ from sqlalchemy import func, select, false
from sqlalchemy.schema import Table as SQLATable
from sqlalchemy.exc import NoSuchTableError
-from dataset.persistence.types import Types
+from dataset.types import Types
-from dataset.persistence.util import normalize_column_name, index_name
+from dataset.util import normalize_column_name, index_name, ensure_tuple
-from dataset.persistence.util import ensure_tuple, ResultIter, QUERY_STEP
+from dataset.util import DatasetException, ResultIter, QUERY_STEP
-from dataset.persistence.util import normalize_table_name
+from dataset.util import normalize_table_name
-from dataset.util import DatasetException
log = logging.getLogger(__name__)

View File

@@ -1,12 +1,105 @@
-# coding: utf-8
-import re
-SLUG_REMOVE = re.compile(r'[,\s\.\(\)/\\;:]*')
try:
from urlparse import urlparse
except ImportError:
from urllib.parse import urlparse
try:
from collections import OrderedDict
except ImportError: # pragma: no cover
from ordereddict import OrderedDict
from six import string_types
from collections import Sequence
from hashlib import sha1
QUERY_STEP = 1000
row_type = OrderedDict
class DatasetException(Exception):
pass
-class FreezeException(DatasetException):
-pass
def convert_row(row_type, row):
if row is None:
return None
return row_type(row.items())
def iter_result_proxy(rp, step=None):
"""Iterate over the ResultProxy."""
while True:
if step is None:
chunk = rp.fetchall()
else:
chunk = rp.fetchmany(step)
if not chunk:
break
for row in chunk:
yield row
class ResultIter(object):
""" SQLAlchemy ResultProxies are not iterable to get a
list of dictionaries. This is to wrap them. """
def __init__(self, result_proxy, row_type=row_type, step=None):
self.row_type = row_type
self.result_proxy = result_proxy
self.keys = list(result_proxy.keys())
self._iter = iter_result_proxy(result_proxy, step=step)
def __next__(self):
return convert_row(self.row_type, next(self._iter))
next = __next__
def __iter__(self):
return self
def close(self):
self.result_proxy.close()
def normalize_column_name(name):
"""Check if a string is a reasonable thing to use as a column name."""
if not isinstance(name, string_types):
raise ValueError('%r is not a valid column name.' % name)
name = name.strip()
if not len(name) or '.' in name or '-' in name:
raise ValueError('%r is not a valid column name.' % name)
return name
def normalize_table_name(name):
"""Check if the table name is obviously invalid."""
if not isinstance(name, string_types):
raise ValueError("Invalid table name: %r" % name)
name = name.strip()
if not len(name):
raise ValueError("Invalid table name: %r" % name)
return name
def safe_url(url):
"""Remove password from printed connection URLs."""
parsed = urlparse(url)
if parsed.password is not None:
pwd = ':%s@' % parsed.password
url = url.replace(pwd, ':*****@')
return url
def index_name(table, columns):
"""Generate an artificial index name."""
sig = '||'.join(columns)
key = sha1(sig.encode('utf-8')).hexdigest()[:16]
return 'ix_%s_%s' % (table, key)
def ensure_tuple(obj):
"""Try and make the given argument into a tuple."""
if obj is None:
return tuple()
if isinstance(obj, Sequence) and not isinstance(obj, string_types):
return tuple(obj)
return obj,
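A quick sketch of the small helpers now consolidated in ``dataset.util`` (illustrative values):

from dataset.util import safe_url, index_name, ensure_tuple

safe_url('postgresql://user:secret@localhost/db')
# -> 'postgresql://user:*****@localhost/db'
index_name('events', ['country', 'date'])
# -> 'ix_events_' followed by a 16-character sha1 prefix
ensure_tuple('country')   # -> ('country',)
ensure_tuple(['a', 'b'])  # -> ('a', 'b')
ensure_tuple(None)        # -> ()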

View File

@@ -48,9 +48,9 @@ copyright = u'2013-2015, Friedrich Lindenberg, Gregor Aisch, Stefan Wehrmeyer'
# built documents.
#
# The short X.Y version.
-version = '0.6'
+version = '1.0'
# The full version, including alpha/beta/rc tags.
-release = '0.6.0'
+release = '1.0.0'
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
# The language for content autogenerated by Sphinx. Refer to documentation # The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages. # for a list of supported languages.

View File

@@ -1,98 +0,0 @@
Freezefiles and the ``datafreeze`` command
==========================================
``datafreeze`` creates static extracts of SQL databases for use in interactive
web applications. SQL databases are a great way to manage relational data, but
exposing them on the web to drive data apps can be cumbersome. Often, the
capacities of a proper database are not actually required, a few static JSON
files and a bit of JavaScript can have the same effect. Still, exporting JSON
by hand (or with a custom script) can also become a messy process.
With ``datafreeze``, exports are scripted in a Makefile-like description, making them simple to repeat and replicate.
Basic Usage
-----------
Calling ``datafreeze`` is simple: the application is invoked with a
freeze file as its argument:

.. code-block:: bash

   datafreeze Freezefile.yaml

Freeze files can be written either in JSON or in YAML. The database URI
indicated in the Freezefile can also be overridden via the command line:

.. code-block:: bash

   datafreeze --db sqlite:///foo.db Freezefile.yaml
Example Freezefile.yaml
-----------------------
A freeze file is composed of a set of scripted queries and
specifications on how their output is to be handled. An example could look
like this:
.. code-block:: yaml
common:
database: "postgresql://user:password@localhost/operational_database"
prefix: my_project/dumps/
format: json
exports:
- query: "SELECT id, title, date FROM events"
filename: "index.json"
- query: "SELECT id, title, date, country FROM events"
filename: "countries/{{country}}.csv"
format: csv
- query: "SELECT * FROM events"
filename: "events/{{id}}.json"
mode: item
- query: "SELECT * FROM events"
filename: "all.json"
format: tabson
An identical JSON configuration can be found in this repository.
Options in detail
-----------------
The freeze file has two main sections, ``common`` and ``exports``. Both
accept many of the same arguments, with ``exports`` specifying a list of
exports while ``common`` defines some shared properties, such as the
database connection string.
The following options are recognized:
* ``database`` is a database URI, including the database type, username
and password, hostname and database name. Valid database types include
``sqlite``, ``mysql`` and ``postgresql`` (requires psycopg2).
* ``prefix`` specifies a common root directory for all extracted files.
* ``format`` identifies the format to be generated, ``csv``, ``json`` and
``tabson`` are supported. ``tabson`` is a condensed JSON
representation in which rows are not represented by objects but by
lists of values.
* ``query`` needs to be a valid SQL statement. All selected fields will
become keys or columns in the output, so it may make sense to define
proper aliases if any overlap is to be expected.
* ``mode`` specifies whether the query output is to be combined into a
single file (``list``) or whether a file should be generated for each
result row (``item``).
* ``filename`` is the output file name, appended to ``prefix``. All
occurrences of ``{{field}}`` are expanded to a field's value to allow the
generation of file names e.g. by primary key. In list mode, templating
can be used to group records into several buckets, e.g. by country or
category.
* ``wrap`` can be used to specify whether the output should be wrapped
in a ``results`` hash in JSON output. This defaults to ``true`` for
``list``-mode output and ``false`` for ``item``-mode.

View File

@@ -10,22 +10,19 @@ dataset: databases for lazy people
:hidden:

Although managing data in relational databases has plenty of benefits, they're
rarely used in day-to-day work with small to medium scale datasets. But why is
that? Why do we see an awful lot of data stored in static files in CSV or JSON
format, even though they are hard to query and update incrementally?

The answer is that **programmers are lazy**, and thus they tend to prefer the
easiest solution they find. And in **Python**, a database isn't the simplest
solution for storing a bunch of structured data. This is what **dataset** is
going to change!

-**dataset** provides two key functions that make using SQL databases in
-Python a breeze:
-* A simple abstraction layer removes most direct SQL statements without
-the necessity for a full ORM model - essentially, databases can be
-used like a JSON file or NoSQL store.
-* Database contents can be exported (*frozen*) using a :doc:`sophisticated
-plain file generator <freezefile>` with JSON and CSV support. Exports can be configured
-to include metadata and dynamic file names depending on the exported
-data. The exporter can also be used as a command-line tool, ``datafreeze``.
+**dataset** provides a simple abstraction layer that removes most direct SQL
+statements without the necessity for a full ORM model - essentially, databases
+can be used like a JSON file or NoSQL store.

A simple data loading script using **dataset** might look like this:
@@ -55,8 +52,6 @@ Features
* **Query helpers** for simple queries such as :py:meth:`all <dataset.Table.all>` rows in a table or
all :py:meth:`distinct <dataset.Table.distinct>` values across a set of columns.
* **Compatibility**: Being built on top of `SQLAlchemy <http://www.sqlalchemy.org/>`_, ``dataset`` works with all major databases, such as SQLite, PostgreSQL and MySQL.
-* **Scripted exports**: Data can be exported based on a scripted
-configuration, making the process easy and replicable.

Contents
--------
@@ -66,12 +61,14 @@ Contents
install
quickstart
-freezefile
api

Contributors
------------

``dataset`` is written and maintained by `Friedrich Lindenberg <https://github.com/pudo>`_,
`Gregor Aisch <https://github.com/gka>`_ and `Stefan Wehrmeyer <https://github.com/stefanw>`_.
Its code is largely based on the preceding libraries `sqlaload <https://github.com/okfn/sqlaload>`_
and datafreeze. And of course, we're standing on the `shoulders of giants <http://www.sqlalchemy.org/>`_.

Our cute little `naked mole rat <http://www.youtube.com/watch?feature=player_detailpage&v=A5DcOEzW1wA#t=14s>`_ was drawn by `Johannes Koch <http://chechuchape.com/>`_.

View File

@@ -2,7 +2,8 @@
Installation Guide
==================

The easiest way is to install ``dataset`` from the `Python Package Index
<https://pypi.python.org/pypi/dataset/>`_ using ``pip`` or ``easy_install``:

.. code-block:: bash

@@ -16,4 +17,6 @@ To install it manually simply download the repository from Github:
$ cd dataset/
$ python setup.py install

Depending on the type of database backend, you may also need to install a
database-specific driver package. For MySQL, this is ``MySQLdb``, for Postgres
it's ``psycopg2``. SQLite support is integrated into Python.

View File

@@ -30,8 +30,8 @@ so you can initialize database connection without explicitly passing an `URL`::
Depending on which database you're using, you may also have to install
the database bindings to support that database. SQLite is included in
the Python core, but PostgreSQL requires ``psycopg2`` to be installed.
MySQL can be enabled by installing the ``mysql-db`` drivers.

Storing data
@@ -110,7 +110,7 @@ database:
Now, let's list all columns available in the table ``user``:

>>> print(db['user'].columns)
[u'id', u'country', u'age', u'name', u'gender']

Using ``len()`` we can get the total number of rows in a table:
@@ -156,7 +156,7 @@ results will be returned::
db = dataset.connect('sqlite:///mydatabase.db', row_type=stuf)

Now contents will be returned in ``stuf`` objects (basically, ``dict``
objects whose elements can be accessed as attributes (``item.name``) as well as
by index (``item['name']``).

Running custom SQL queries
@@ -169,36 +169,10 @@ use the full power of SQL queries. Here's how you run them with ``dataset``::
for row in result:
print(row['country'], row['c'])

The :py:meth:`query() <dataset.Table.query>` method can also be used to
access the underlying `SQLAlchemy core API <http://docs.sqlalchemy.org/en/latest/orm/query.html#the-query-object>`_, which allows for the
programmatic construction of more complex queries::

table = db['user'].table
statement = table.select(table.c.name.like('%John%'))
result = db.query(statement)
-Exporting data
---------------
-While playing around with our database in Python is a nice thing, it is
-sometimes just a processing stage until we go on to use the data in another
-place, say in an interactive web application. To make this seamless,
-``dataset`` supports serializing rows of data into static JSON and CSV files
-using the :py:meth:`freeze() <dataset.freeze>` function::
-# export all users into a single JSON
-result = db['users'].all()
-dataset.freeze(result, format='json', filename='users.json')
-You can create one file per row by setting ``mode`` to "item"::
-# export one JSON file per user
-dataset.freeze(result, format='json', filename='users/{{ id }}.json', mode='item')
-Since this is a common operation we made it available via the command-line
-utility ``datafreeze``. Read more about the :doc:`freezefile markup <freezefile>`.
-.. code-block:: bash
-$ datafreeze freezefile.yaml
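With the exporter removed by this commit, a rough stdlib-only equivalent of the examples above might look like this. This is a sketch, not part of the commit; it assumes ``db`` is a ``dataset.connect(...)`` connection and a ``users`` table as in the removed section:

.. code-block:: python

   import json
   import os

   rows = [dict(row) for row in db['users'].all()]

   # single JSON file, similar to the old mode='list'
   with open('users.json', 'w') as fh:
       json.dump({'count': len(rows), 'results': rows}, fh, indent=2, default=str)

   # one file per row, similar to the old mode='item'
   if not os.path.isdir('users'):
       os.makedirs('users')
   for row in rows:
       with open(os.path.join('users', '%s.json' % row['id']), 'w') as fh:
           json.dump(row, fh, indent=2, default=str)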

View File

@@ -8,7 +8,7 @@ if sys.version_info[:2] <= (2, 6):
setup(
name='dataset',
-version='0.8.0',
+version='1.0.0',
description="Toolkit for Python-based data processing.",
long_description="",
classifiers=[
@@ -34,14 +34,9 @@ setup(
'sqlalchemy >= 1.1.0',
'alembic >= 0.6.2',
'normality >= 0.3.9',
-"PyYAML >= 3.10",
"six >= 1.7.3"
] + py26_dependency,
tests_require=[],
test_suite='test',
-entry_points={
+entry_points={}
-'console_scripts': [
-'datafreeze = dataset.freeze.app:main',
-]
-}
)

View File

@@ -1,32 +0,0 @@
common:
database: "postgresql://user:password@localhost/operational_database"
prefix: my_project/dumps/
format: json
nested:
property: "inner"
exports:
- query: "SELECT id, title, date FROM events"
filename: "index.json"
number: 5
bool: true
nested:
property: "override"
- query: "SELECT id, title, date, country FROM events"
filename: "countries/{{country}}.csv"
format: csv
- query: "SELECT * FROM events"
filename: "events/{{id}}.json"
mode: item
wrap: true
- query: "SELECT * FROM events"
filename: "all.json"
format: tabson

View File

@@ -1,67 +0,0 @@
# coding: utf-8
"""
Test CLI following the recipe at http://dustinrcollins.com/testing-python-command-line-apps
"""
import os
import unittest
from tempfile import mkdtemp
from shutil import rmtree
from copy import copy
from six import StringIO
from dataset import connect
from dataset.util import FreezeException
from dataset.freeze.config import Configuration, Export
from dataset.freeze.app import create_parser, freeze_with_config, freeze_export
from .sample_data import TEST_DATA
class FreezeAppTestCase(unittest.TestCase):
"""
Base TestCase class, sets up a CLI parser
"""
def setUp(self):
parser = create_parser()
self.parser = parser
self.d = mkdtemp()
self.db_path = os.path.abspath(os.path.join(self.d, 'db.sqlite'))
self.db = 'sqlite:///' + self.db_path
_db = connect(self.db)
tbl = _db['weather']
for i, row in enumerate(TEST_DATA):
_row = copy(row)
_row['count'] = i
_row['bool'] = True
_row['none'] = None
tbl.insert(_row)
def tearDown(self):
rmtree(self.d, ignore_errors=True)
def test_with_config(self):
cfg = Configuration(os.path.join(os.path.dirname(__file__), 'Freezefile.yaml'))
cfg.data['common']['database'] = self.db
cfg.data['common']['prefix'] = self.d
cfg.data['common']['query'] = 'SELECT * FROM weather'
cfg.data['exports'] = [
{'filename': '{{identity:count}}.json', 'mode': 'item', 'transform': {'bool': 'identity'}},
{'filename': 'weather.json', 'format': 'tabson'},
{'filename': 'weather.csv', 'fileobj': StringIO(), 'format': 'csv'},
{'filename': 'weather.json', 'fileobj': StringIO(), 'format': 'tabson'},
{'filename': 'weather.json', 'format': 'tabson', 'callback': 'read'},
{'skip': True}]
freeze_with_config(cfg, db=self.db)
self.assertRaises(FreezeException, freeze_export, Export(cfg.data['common'], {'query': 'SELECT * FROM notable'}))
def test_unicode_path(self):
cfg = Configuration(os.path.join(os.path.dirname(__file__), 'Freezefile.yaml'))
cfg.data['common']['database'] = self.db
cfg.data['common']['prefix'] = os.path.join(self.d, u'über')
cfg.data['common']['query'] = 'SELECT * FROM weather'
cfg.data['exports'] = [{'filename': 'weather.csv', 'format': 'csv'}]
freeze_with_config(cfg, db=self.db)
if __name__ == '__main__':
unittest.main()