562 lines
19 KiB
Python
Raw Normal View History

import logging
from sqlalchemy.sql import and_, expression
from sqlalchemy.sql.expression import ClauseElement
from sqlalchemy.schema import Column, Index
2017-01-29 15:45:05 +01:00
from sqlalchemy import func, select, false
2017-09-02 23:05:50 +02:00
from sqlalchemy.engine.reflection import Inspector
2017-09-02 23:05:50 +02:00
from dataset.persistence.util import normalize_column_name, index_name
2015-01-06 21:41:01 +01:00
from dataset.persistence.util import ResultIter
2013-04-05 00:31:13 +02:00
from dataset.util import DatasetException
log = logging.getLogger(__name__)


class Table(object):
    """Represents a table in a database and exposes common operations."""

    # Default primary-key column name used by upsert's return value.
    PRIMARY_DEFAULT = 'id'
def __init__(self, database, table):
2016-04-10 09:25:14 +02:00
"""Initialise the table from database schema."""
2013-04-01 18:05:41 +02:00
self.database = database
2017-09-02 22:35:29 +02:00
self.name = table.name
2013-04-01 18:05:41 +02:00
self.table = table
2013-04-05 00:31:13 +02:00
self._is_dropped = False
2017-09-02 23:05:50 +02:00
self._indexes = []
@property
def exists(self):
"""Check to see if the table currently exists in the database."""
if self.table is not None:
return True
return self.name in self.database
2013-04-01 18:05:41 +02:00
2013-04-03 00:51:33 +02:00
@property
def columns(self):
2016-04-10 09:25:14 +02:00
"""Get a listing of all columns that exist in the table."""
2017-09-02 23:05:50 +02:00
if not self.exists:
2017-09-02 22:35:29 +02:00
return []
return self.table.columns.keys()
2013-04-03 00:51:33 +02:00
2013-04-01 18:05:41 +02:00
def drop(self):
2013-04-03 00:51:33 +02:00
"""
2016-04-10 09:25:14 +02:00
Drop the table from the database.
2013-04-02 11:10:29 +02:00
2016-04-10 09:25:14 +02:00
Delete both the schema and all the contents within it.
2013-04-05 00:49:13 +02:00
Note: the object will raise an Exception if you use it after
dropping the table. If you want to re-create the table, make
sure to get a fresh instance from the :py:class:`Database <dataset.Database>`.
2013-04-03 00:51:33 +02:00
"""
self._check_dropped()
2017-09-02 22:35:29 +02:00
with self.database.lock:
2017-09-02 20:33:11 +02:00
self.table.drop(self.database.executable, checkfirst=True)
2017-09-02 22:35:29 +02:00
# self.database._tables.pop(self.name, None)
2017-09-02 20:33:11 +02:00
self.table = None
2013-04-01 18:05:41 +02:00
2013-04-05 00:31:13 +02:00
def _check_dropped(self):
2017-09-02 22:35:29 +02:00
# self.table = self.database._reflect_table(self.name)
2017-09-02 20:33:11 +02:00
if self.table is None:
2017-09-02 22:35:29 +02:00
raise DatasetException('The table has been dropped.')
2013-04-05 00:31:13 +02:00
def insert(self, row, ensure=None, types=None):
2013-04-03 00:51:33 +02:00
"""
Add a row (type: dict) by inserting it into the table.
2016-04-10 09:25:14 +02:00
2013-04-02 00:10:07 +02:00
If ``ensure`` is set, any of the keys of the row are not
2013-04-02 11:10:29 +02:00
table columns, they will be created automatically.
2013-04-02 00:10:07 +02:00
During column creation, ``types`` will be checked for a key
2013-04-02 11:10:29 +02:00
matching the name of a column to be created, and the given
2013-04-02 00:10:07 +02:00
SQLAlchemy column type will be used. Otherwise, the type is
2013-04-03 22:27:06 +02:00
guessed from the row value, defaulting to a simple unicode
2013-04-02 13:44:14 +02:00
field.
::
2013-04-03 00:51:33 +02:00
2013-04-04 20:05:27 +02:00
data = dict(title='I am a banana!')
table.insert(data)
2015-01-07 15:06:34 +01:00
Returns the inserted row's primary key.
2013-04-02 13:44:14 +02:00
"""
2013-04-05 00:31:13 +02:00
self._check_dropped()
ensure = self.database.ensure_schema if ensure is None else ensure
if ensure:
self._ensure_columns(row, types=types)
2016-07-03 11:09:39 +02:00
else:
row = self._prune_row(row)
2013-05-10 22:58:23 +02:00
res = self.database.executable.execute(self.table.insert(row))
if len(res.inserted_primary_key) > 0:
return res.inserted_primary_key[0]
def insert_ignore(self, row, keys, ensure=None, types=None):
"""
Add a row (type: dict) into the table if the row does not exist.
If rows with matching ``keys`` exist they will be added to the table.
Setting ``ensure`` results in automatically creating missing columns,
i.e., keys of the row are not table columns.
During column creation, ``types`` will be checked for a key
matching the name of a column to be created, and the given
SQLAlchemy column type will be used. Otherwise, the type is
guessed from the row value, defaulting to a simple unicode
field.
::
data = dict(id=10, title='I am a banana!')
table.insert_ignore(data, ['id'])
"""
2016-07-03 11:09:39 +02:00
row, res = self._upsert_pre_check(row, keys, ensure)
if res is None:
2016-04-22 18:57:35 +02:00
return self.insert(row, ensure=ensure, types=types)
else:
return False
def insert_many(self, rows, chunk_size=1000, ensure=None, types=None):
2013-04-04 15:43:05 +02:00
"""
2016-04-10 09:25:14 +02:00
Add many rows at a time.
This is significantly faster than adding them one by one. Per default
the rows are processed in chunks of 1000 per commit, unless you specify
a different ``chunk_size``.
2013-04-04 19:44:28 +02:00
See :py:meth:`insert() <dataset.Table.insert>` for details on
the other parameters.
2013-04-04 15:43:05 +02:00
::
rows = [dict(name='Dolly')] * 10000
table.insert_many(rows)
"""
ensure = self.database.ensure_schema if ensure is None else ensure
2013-04-04 15:43:05 +02:00
def _process_chunk(chunk):
if ensure:
for row in chunk:
self._ensure_columns(row, types=types)
2016-07-03 11:09:39 +02:00
else:
chunk = [self._prune_row(r) for r in chunk]
2013-04-04 15:43:05 +02:00
self.table.insert().execute(chunk)
2013-04-05 00:31:13 +02:00
self._check_dropped()
2014-01-25 21:45:30 +01:00
2013-04-04 15:43:05 +02:00
chunk = []
for i, row in enumerate(rows, start=1):
2013-04-04 15:43:05 +02:00
chunk.append(row)
if i % chunk_size == 0:
2013-04-04 15:43:05 +02:00
_process_chunk(chunk)
chunk = []
if chunk:
2013-04-04 15:43:05 +02:00
_process_chunk(chunk)
def update(self, row, keys, ensure=None, types=None):
2013-04-03 00:51:33 +02:00
"""
2016-04-10 09:25:14 +02:00
Update a row in the table.
The update is managed via the set of column names stated in ``keys``:
they will be used as filters for the data to be updated, using the values
2013-04-02 11:10:29 +02:00
in ``row``.
::
2013-04-02 00:10:07 +02:00
2013-04-02 11:10:29 +02:00
# update all entries with id matching 10, setting their title columns
2013-04-02 00:10:07 +02:00
data = dict(id=10, title='I am a banana!')
table.update(data, ['id'])
2013-04-02 11:10:29 +02:00
If keys in ``row`` update columns not present in the table,
they will be created based on the settings of ``ensure`` and
2013-04-04 19:44:28 +02:00
``types``, matching the behavior of :py:meth:`insert() <dataset.Table.insert>`.
2013-04-02 00:10:07 +02:00
"""
2017-09-02 22:35:29 +02:00
self._check_dropped()
# check whether keys arg is a string and format as a list
if not isinstance(keys, (list, tuple)):
keys = [keys]
2014-01-25 21:45:30 +01:00
if not keys or len(keys) == len(row):
return False
2013-04-02 11:10:29 +02:00
clause = [(u, row.get(u)) for u in keys]
ensure = self.database.ensure_schema if ensure is None else ensure
if ensure:
self._ensure_columns(row, types=types)
2016-07-03 11:09:39 +02:00
else:
row = self._prune_row(row)
# Don't update the key itself, so remove any keys from the row dict
2013-12-05 11:09:29 +01:00
clean_row = row.copy()
2013-12-04 17:55:42 +01:00
for key in keys:
if key in clean_row.keys():
del clean_row[key]
try:
filters = self._args_to_clause(dict(clause))
stmt = self.table.update(filters, clean_row)
2013-05-10 22:58:23 +02:00
rp = self.database.executable.execute(stmt)
return rp.rowcount
2013-04-03 00:56:07 +02:00
except KeyError:
return 0
def _upsert_pre_check(self, row, keys, ensure):
# check whether keys arg is a string and format as a list
2016-07-03 11:09:39 +02:00
if not isinstance(keys, (list, tuple)):
keys = [keys]
self._check_dropped()
2016-07-03 11:09:39 +02:00
ensure = self.database.ensure_schema if ensure is None else ensure
if ensure:
self.create_index(keys)
else:
row = self._prune_row(row)
2016-07-03 11:09:39 +02:00
filters = {}
for key in keys:
filters[key] = row.get(key)
return row, self.find_one(**filters)
def upsert(self, row, keys, ensure=None, types=None):
2013-04-03 00:51:33 +02:00
"""
2016-04-10 09:25:14 +02:00
An UPSERT is a smart combination of insert and update.
If rows with matching ``keys`` exist they will be updated, otherwise a
new row is inserted in the table.
2013-04-02 11:10:29 +02:00
::
2013-04-03 00:51:33 +02:00
2013-04-02 11:10:29 +02:00
data = dict(id=10, title='I am a banana!')
table.upsert(data, ['id'])
"""
2016-07-03 11:09:39 +02:00
row, res = self._upsert_pre_check(row, keys, ensure)
if res is None:
return self.insert(row, ensure=ensure, types=types)
else:
row_count = self.update(row, keys, ensure=ensure, types=types)
try:
2016-04-23 13:59:25 +02:00
result = (row_count > 0, res['id'])[row_count == 1]
except KeyError:
result = row_count > 0
return result
def delete(self, *_clauses, **_filter):
2016-04-10 09:25:14 +02:00
"""
Delete rows from the table.
Keyword arguments can be used to add column-based filters. The filter
criterion will always be equality:
2013-04-02 00:20:02 +02:00
.. code-block:: python
table.delete(place='Berlin')
2013-04-04 20:08:39 +02:00
If no arguments are given, all records are deleted.
2013-04-02 00:20:02 +02:00
"""
2013-04-05 00:31:13 +02:00
self._check_dropped()
2017-09-02 23:05:50 +02:00
if not self.exists:
return
if _filter or _clauses:
q = self._args_to_clause(_filter, clauses=_clauses)
2013-04-05 11:54:12 +02:00
stmt = self.table.delete(q)
else:
stmt = self.table.delete()
2014-01-31 21:42:18 +01:00
rows = self.database.executable.execute(stmt)
2014-01-31 21:53:30 +01:00
return rows.rowcount > 0
def _has_column(self, column):
2017-09-02 20:33:11 +02:00
return normalize_column_name(column) in self.columns
def _ensure_columns(self, row, types=None):
# Keep order of inserted columns
for column in row.keys():
if self._has_column(column):
continue
if types is not None and column in types:
_type = types[column]
else:
_type = self.database.types.guess(row[column])
2013-04-02 11:10:29 +02:00
log.debug("Creating column: %s (%s) on %r" % (column,
_type,
self.table.name))
self.create_column(column, _type)
2017-09-02 23:05:50 +02:00
def _prune_row(self, row):
"""Remove keys from row not in column set."""
# normalize keys
row = {normalize_column_name(k): v for k, v in row.items()}
# filter out keys not in column set
return {k: row[k] for k in row if k in self.columns}
def _args_to_clause(self, args, ensure=None, clauses=()):
ensure = self.database.ensure_schema if ensure is None else ensure
if ensure:
self._ensure_columns(args)
clauses = list(clauses)
for k, v in args.items():
if not self._has_column(k):
2016-04-22 15:45:10 +02:00
clauses.append(false())
elif isinstance(v, (list, tuple)):
clauses.append(self.table.c[k].in_(v))
else:
clauses.append(self.table.c[k] == v)
return and_(*clauses)
def create_column(self, name, type):
2013-04-03 01:48:26 +02:00
"""
2016-07-28 00:23:38 +02:00
Explicitly create a new column ``name`` of a specified type.
2016-04-10 09:25:14 +02:00
2013-04-03 22:27:06 +02:00
``type`` must be a `SQLAlchemy column type <http://docs.sqlalchemy.org/en/rel_0_8/core/types.html>`_.
2013-04-03 01:48:26 +02:00
::
table.create_column('created_at', db.types.datetime)
2013-04-03 01:48:26 +02:00
"""
2013-04-05 00:31:13 +02:00
self._check_dropped()
2017-09-02 22:35:29 +02:00
with self.database.lock:
2017-09-02 20:33:11 +02:00
name = normalize_column_name(name)
if name in self.columns:
log.debug("Column exists: %s" % name)
return
self.database.op.add_column(
self.table.name,
Column(name, type),
self.table.schema
)
self.table = self.database._reflect_table(self.table.name)
def create_column_by_example(self, name, value):
"""
Explicitly create a new column ``name`` with a type that is appropriate to store
the given example ``value``. The type is guessed in the same way as for the
insert method with ``ensure=True``. If a column of the same name already exists,
no action is taken, even if it is not of the type we would have created.
table.create_column_by_example('length', 4.2)
"""
type_ = self.database.types.guess(value)
self.create_column(name, type_)
2013-12-04 20:44:01 +01:00
def drop_column(self, name):
2017-09-02 23:05:50 +02:00
"""Drop the column ``name``.
2016-04-10 09:25:14 +02:00
2013-12-04 20:44:01 +01:00
::
table.drop_column('created_at')
"""
2016-01-18 11:14:30 +01:00
if self.database.engine.dialect.name == 'sqlite':
raise NotImplementedError("SQLite does not support dropping columns.")
2017-09-02 23:05:50 +02:00
if not self.exists:
return
2013-12-04 20:44:01 +01:00
self._check_dropped()
2017-09-02 22:35:29 +02:00
if name not in self.columns:
log.debug("Column does not exist: %s", name)
return
2017-09-02 22:35:29 +02:00
with self.database.lock:
self.database.op.drop_column(
self.table.name,
name,
self.table.schema
)
2017-09-02 23:05:50 +02:00
self.table = self.database._reflect_table(self.table.name)
def has_index(self, columns):
"""Check if an index exists to cover the given `columns`."""
columns = set([normalize_column_name(c) for c in columns])
if columns in self._indexes:
return True
inspector = Inspector.from_engine(self.database.executable)
indexes = inspector.get_indexes(self.name, schema=self.database.schema)
for index in indexes:
if columns == set(index.get('column_names', [])):
self._indexes.append(columns)
return True
return False
2013-12-04 20:44:01 +01:00
def create_index(self, columns, name=None, **kw):
2013-04-03 01:48:26 +02:00
"""
2016-04-10 09:25:14 +02:00
Create an index to speed up queries on a table.
If no ``name`` is given a random name is created.
2013-04-03 01:48:26 +02:00
::
table.create_index(['name', 'country'])
"""
2013-04-05 00:31:13 +02:00
self._check_dropped()
2017-09-02 23:05:50 +02:00
columns = [normalize_column_name(c) for c in columns]
2017-09-02 22:35:29 +02:00
with self.database.lock:
2017-09-02 23:05:50 +02:00
if not self.has_index(columns):
name = name or index_name(self.name, columns)
2017-09-02 22:35:29 +02:00
columns = [self.table.c[c] for c in columns]
idx = Index(name, *columns, **kw)
idx.create(self.database.executable)
def _args_to_order_by(self, order_by):
2017-01-29 15:45:05 +01:00
if not isinstance(order_by, (list, tuple)):
order_by = [order_by]
orderings = []
for ordering in order_by:
if ordering is None:
continue
column = ordering.lstrip('-')
if column not in self.table.columns:
continue
if ordering.startswith('-'):
orderings.append(self.table.c[column].desc())
else:
orderings.append(self.table.c[column].asc())
return orderings
def find(self, *_clauses, **kwargs):
2013-04-03 00:51:33 +02:00
"""
2016-04-10 09:25:14 +02:00
Perform a simple search on the table.
Simply pass keyword arguments as ``filter``.
2013-04-02 11:10:29 +02:00
::
2013-04-03 00:51:33 +02:00
2013-04-02 11:10:29 +02:00
results = table.find(country='France')
results = table.find(country='France', year=1980)
2013-04-02 23:45:44 +02:00
2013-04-03 00:51:33 +02:00
Using ``_limit``::
2013-04-02 11:10:29 +02:00
# just return the first 10 rows
results = table.find(country='France', _limit=10)
2013-04-02 13:44:14 +02:00
2013-04-02 23:45:44 +02:00
You can sort the results by single or multiple columns. Append a minus sign
to the column name for descending order::
2013-04-03 00:51:33 +02:00
# sort results by a column 'year'
2013-04-02 11:10:29 +02:00
results = table.find(country='France', order_by='year')
# return all rows sorted by multiple columns (by year in descending order)
results = table.find(order_by=['country', '-year'])
2013-04-02 11:10:29 +02:00
2013-04-03 22:27:06 +02:00
For more complex queries, please use :py:meth:`db.query() <dataset.Database.query>`
2016-04-10 09:25:14 +02:00
instead.
"""
_limit = kwargs.pop('_limit', None)
_offset = kwargs.pop('_offset', 0)
_step = kwargs.pop('_step', 5000)
2017-01-29 15:45:05 +01:00
order_by = kwargs.pop('order_by', None)
2013-04-05 00:31:13 +02:00
self._check_dropped()
2017-01-29 15:45:05 +01:00
order_by = self._args_to_order_by(order_by)
args = self._args_to_clause(kwargs, ensure=False, clauses=_clauses)
2017-01-29 15:45:05 +01:00
if _step is False or _step == 0:
_step = None
query = self.table.select(whereclause=args, limit=_limit,
2017-01-29 15:45:05 +01:00
offset=_offset)
if len(order_by):
query = query.order_by(*order_by)
return ResultIter(self.database.executable.execute(query),
row_type=self.database.row_type, step=_step)
2017-09-02 22:35:29 +02:00
def find_one(self, *args, **kwargs):
"""Get a single result from the table.
Works just like :py:meth:`find() <dataset.Table.find>` but returns one
result, or None.
::
row = table.find_one(country='United States')
"""
2017-09-02 23:05:50 +02:00
if not self.exists:
return None
2017-09-02 22:35:29 +02:00
kwargs['_limit'] = 1
kwargs['_step'] = None
resiter = self.find(*args, **kwargs)
try:
for row in resiter:
return row
finally:
resiter.close()
2017-01-29 15:45:05 +01:00
def count(self, *_clauses, **kwargs):
2016-04-10 09:25:14 +02:00
"""Return the count of results for the given filter set."""
2017-01-29 15:45:05 +01:00
# NOTE: this does not have support for limit and offset since I can't
# see how this is useful. Still, there might be compatibility issues
# with people using these flags. Let's see how it goes.
2017-09-02 23:05:50 +02:00
if not self.exists:
return 0
2017-01-29 15:45:05 +01:00
self._check_dropped()
args = self._args_to_clause(kwargs, ensure=False, clauses=_clauses)
query = select([func.count()], whereclause=args)
query = query.select_from(self.table)
rp = self.database.executable.execute(query)
return rp.fetchone()[0]
2013-04-01 19:56:14 +02:00
def __len__(self):
2016-04-10 09:25:14 +02:00
"""Return the number of rows in the table."""
return self.count()
2013-04-01 19:56:14 +02:00
def distinct(self, *args, **_filter):
2013-04-03 00:51:33 +02:00
"""
2016-04-10 09:25:14 +02:00
Return all rows of a table, but remove rows in with duplicate values in ``columns``.
2013-04-02 11:10:29 +02:00
Interally this creates a `DISTINCT statement <http://www.w3schools.com/sql/sql_distinct.asp>`_.
::
2013-04-03 00:51:33 +02:00
2013-04-02 11:10:29 +02:00
# returns only one row per year, ignoring the rest
table.distinct('year')
# works with multiple columns, too
table.distinct('year', 'country')
# you can also combine this with a filter
table.distinct('year', country='China')
"""
2013-04-05 00:31:13 +02:00
self._check_dropped()
qargs = []
columns = []
try:
for c in args:
if isinstance(c, ClauseElement):
qargs.append(c)
else:
columns.append(self.table.c[c])
for col, val in _filter.items():
2013-04-02 11:10:29 +02:00
qargs.append(self.table.c[col] == val)
except KeyError:
return []
q = expression.select(columns, distinct=True,
2013-04-02 11:10:29 +02:00
whereclause=and_(*qargs),
order_by=[c.asc() for c in columns])
2013-04-01 19:28:22 +02:00
return self.database.query(q)
def __getitem__(self, item):
2016-04-10 09:25:14 +02:00
"""
Get distinct column values.
This is an alias for distinct which allows the table to be queried as using
square bracket syntax.
::
# Same as distinct:
print list(table['year'])
"""
if not isinstance(item, tuple):
item = item,
return self.distinct(*item)
def all(self):
2013-04-03 00:51:33 +02:00
"""
2016-04-10 09:25:14 +02:00
Return all rows of the table as simple dictionaries.
This is simply a shortcut to *find()* called with no arguments.
2013-04-02 11:10:29 +02:00
::
2013-04-03 00:51:33 +02:00
2016-04-10 09:25:14 +02:00
rows = table.all()
"""
return self.find()
2013-04-01 18:05:41 +02:00
def __iter__(self):
"""
2016-04-10 09:25:14 +02:00
Return all rows of the table as simple dictionaries.
2013-04-03 12:46:10 +02:00
Allows for iterating over all rows in the table without explicetly
calling :py:meth:`all() <dataset.Table.all>`.
::
for row in table:
2014-01-31 20:42:26 +01:00
print(row)
"""
2013-04-12 16:42:22 +02:00
return self.all()
2014-01-31 21:44:39 +01:00
def __repr__(self):
2016-04-10 09:25:14 +02:00
"""Get table representation."""
2014-01-31 21:44:39 +01:00
return '<Table(%s)>' % self.table.name