123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215 |
- # pylint: disable=C,R,W
- """ Superset wrapper around pandas.DataFrame.
- TODO(bkyryliuk): add support for the conventions like: *_dim or dim_*
- dimensions, *_ts, ts_*, ds_*, *_ds - datetime, etc.
- TODO(bkyryliuk): recognize integer encoded enums.
- """
- from datetime import date, datetime
- import logging
- import numpy as np
- import pandas as pd
- from pandas.core.common import _maybe_box_datetimelike
- from pandas.core.dtypes.dtypes import ExtensionDtype
- from past.builtins import basestring
- from superset.utils import JS_MAX_INTEGER
- INFER_COL_TYPES_THRESHOLD = 95
- INFER_COL_TYPES_SAMPLE_SIZE = 100
def dedup(l, suffix='__', case_sensitive=True):
    """De-duplicates a list of string by suffixing a counter

    Always returns the same number of entries as provided, and always returns
    unique values. Case sensitive comparison by default.

    >>> print(','.join(dedup(['foo', 'bar', 'bar', 'bar', 'Bar'])))
    foo,bar,bar__1,bar__2,Bar
    >>> print(','.join(dedup(['foo', 'bar', 'bar', 'bar', 'Bar'], case_sensitive=False)))
    foo,bar,bar__1,bar__2,Bar__3
    """
    new_l = []
    seen = {}
    for s in l:
        s_fixed_case = s if case_sensitive else s.lower()
        if s_fixed_case in seen:
            # bump the counter until the generated name is itself unique;
            # without this, inputs like ['bar', 'bar', 'bar__1'] would
            # yield a duplicate 'bar__1', breaking the contract above
            while True:
                seen[s_fixed_case] += 1
                candidate = s + suffix + str(seen[s_fixed_case])
                candidate_key = candidate if case_sensitive else candidate.lower()
                if candidate_key not in seen:
                    break
            s = candidate
            # reserve the generated name so a later literal can't reuse it
            seen[candidate_key] = 0
        else:
            seen[s_fixed_case] = 0
        new_l.append(s)
    return new_l
- class SupersetDataFrame(object):
- # Mapping numpy dtype.char to generic database types
- type_map = {
- 'b': 'BOOL', # boolean
- 'i': 'INT', # (signed) integer
- 'u': 'INT', # unsigned integer
- 'l': 'INT', # 64bit integer
- 'f': 'FLOAT', # floating-point
- 'c': 'FLOAT', # complex-floating point
- 'm': None, # timedelta
- 'M': 'DATETIME', # datetime
- 'O': 'OBJECT', # (Python) objects
- 'S': 'BYTE', # (byte-)string
- 'U': 'STRING', # Unicode
- 'V': None, # raw data (void)
- }
- def __init__(self, data, cursor_description, db_engine_spec):
- column_names = []
- if cursor_description:
- column_names = [col[0] for col in cursor_description]
- self.column_names = dedup(column_names)
- data = data or []
- self.df = (
- pd.DataFrame(list(data), columns=self.column_names).infer_objects())
- self._type_dict = {}
- try:
- # The driver may not be passing a cursor.description
- self._type_dict = {
- col: db_engine_spec.get_datatype(cursor_description[i][1])
- for i, col in enumerate(self.column_names)
- if cursor_description
- }
- except Exception as e:
- logging.exception(e)
- @property
- def size(self):
- return len(self.df.index)
- @property
- def data(self):
- # work around for https://github.com/pandas-dev/pandas/issues/18372
- data = [dict((k, _maybe_box_datetimelike(v))
- for k, v in zip(self.df.columns, np.atleast_1d(row)))
- for row in self.df.values]
- for d in data:
- for k, v in list(d.items()):
- # if an int is too big for Java Script to handle
- # convert it to a string
- if isinstance(v, int):
- if abs(v) > JS_MAX_INTEGER:
- d[k] = str(v)
- return data
- @classmethod
- def db_type(cls, dtype):
- """Given a numpy dtype, Returns a generic database type"""
- if isinstance(dtype, ExtensionDtype):
- return cls.type_map.get(dtype.kind)
- elif hasattr(dtype, 'char'):
- return cls.type_map.get(dtype.char)
- @classmethod
- def datetime_conversion_rate(cls, data_series):
- success = 0
- total = 0
- for value in data_series:
- total += 1
- try:
- pd.to_datetime(value)
- success += 1
- except Exception:
- continue
- return 100 * success / total
- @staticmethod
- def is_date(np_dtype, db_type_str):
- def looks_daty(s):
- if isinstance(s, basestring):
- return any([s.lower().startswith(ss) for ss in ('time', 'date')])
- return False
- if looks_daty(db_type_str):
- return True
- if np_dtype and np_dtype.name and looks_daty(np_dtype.name):
- return True
- return False
- @classmethod
- def is_dimension(cls, dtype, column_name):
- if cls.is_id(column_name):
- return False
- return dtype.name in ('object', 'bool')
- @classmethod
- def is_id(cls, column_name):
- return column_name.startswith('id') or column_name.endswith('id')
- @classmethod
- def agg_func(cls, dtype, column_name):
- # consider checking for key substring too.
- if cls.is_id(column_name):
- return 'count_distinct'
- if (hasattr(dtype, 'type') and issubclass(dtype.type, np.generic) and
- np.issubdtype(dtype, np.number)):
- return 'sum'
- return None
- @property
- def columns(self):
- """Provides metadata about columns for data visualization.
- :return: dict, with the fields name, type, is_date, is_dim and agg.
- """
- if self.df.empty:
- return None
- columns = []
- sample_size = min(INFER_COL_TYPES_SAMPLE_SIZE, len(self.df.index))
- sample = self.df
- if sample_size:
- sample = self.df.sample(sample_size)
- for col in self.df.dtypes.keys():
- db_type_str = (
- self._type_dict.get(col) or
- self.db_type(self.df.dtypes[col])
- )
- column = {
- 'name': col,
- 'agg': self.agg_func(self.df.dtypes[col], col),
- 'type': db_type_str,
- 'is_date': self.is_date(self.df.dtypes[col], db_type_str),
- 'is_dim': self.is_dimension(self.df.dtypes[col], col),
- }
- if not db_type_str or db_type_str.upper() == 'OBJECT':
- v = sample[col].iloc[0] if not sample[col].empty else None
- if isinstance(v, basestring):
- column['type'] = 'STRING'
- elif isinstance(v, int):
- column['type'] = 'INT'
- elif isinstance(v, float):
- column['type'] = 'FLOAT'
- elif isinstance(v, (datetime, date)):
- column['type'] = 'DATETIME'
- column['is_date'] = True
- column['is_dim'] = False
- # check if encoded datetime
- if (
- column['type'] == 'STRING' and
- self.datetime_conversion_rate(sample[col]) >
- INFER_COL_TYPES_THRESHOLD):
- column.update({
- 'is_date': True,
- 'is_dim': False,
- 'agg': None,
- })
- # 'agg' is optional attribute
- if not column['agg']:
- column.pop('agg', None)
- columns.append(column)
- return columns
|