# dataframe.py
  1. # pylint: disable=C,R,W
  2. """ Superset wrapper around pandas.DataFrame.
  3. TODO(bkyryliuk): add support for the conventions like: *_dim or dim_*
  4. dimensions, *_ts, ts_*, ds_*, *_ds - datetime, etc.
  5. TODO(bkyryliuk): recognize integer encoded enums.
  6. """
  7. from datetime import date, datetime
  8. import logging
  9. import numpy as np
  10. import pandas as pd
  11. from pandas.core.common import _maybe_box_datetimelike
  12. from pandas.core.dtypes.dtypes import ExtensionDtype
  13. from past.builtins import basestring
  14. from superset.utils import JS_MAX_INTEGER
  15. INFER_COL_TYPES_THRESHOLD = 95
  16. INFER_COL_TYPES_SAMPLE_SIZE = 100
  17. def dedup(l, suffix='__', case_sensitive=True):
  18. """De-duplicates a list of string by suffixing a counter
  19. Always returns the same number of entries as provided, and always returns
  20. unique values. Case sensitive comparison by default.
  21. >>> print(','.join(dedup(['foo', 'bar', 'bar', 'bar', 'Bar'])))
  22. foo,bar,bar__1,bar__2,Bar
  23. >>> print(','.join(dedup(['foo', 'bar', 'bar', 'bar', 'Bar'], case_sensitive=False)))
  24. foo,bar,bar__1,bar__2,Bar__3
  25. """
  26. new_l = []
  27. seen = {}
  28. for s in l:
  29. s_fixed_case = s if case_sensitive else s.lower()
  30. if s_fixed_case in seen:
  31. seen[s_fixed_case] += 1
  32. s += suffix + str(seen[s_fixed_case])
  33. else:
  34. seen[s_fixed_case] = 0
  35. new_l.append(s)
  36. return new_l
  37. class SupersetDataFrame(object):
  38. # Mapping numpy dtype.char to generic database types
  39. type_map = {
  40. 'b': 'BOOL', # boolean
  41. 'i': 'INT', # (signed) integer
  42. 'u': 'INT', # unsigned integer
  43. 'l': 'INT', # 64bit integer
  44. 'f': 'FLOAT', # floating-point
  45. 'c': 'FLOAT', # complex-floating point
  46. 'm': None, # timedelta
  47. 'M': 'DATETIME', # datetime
  48. 'O': 'OBJECT', # (Python) objects
  49. 'S': 'BYTE', # (byte-)string
  50. 'U': 'STRING', # Unicode
  51. 'V': None, # raw data (void)
  52. }
  53. def __init__(self, data, cursor_description, db_engine_spec):
  54. column_names = []
  55. if cursor_description:
  56. column_names = [col[0] for col in cursor_description]
  57. self.column_names = dedup(column_names)
  58. data = data or []
  59. self.df = (
  60. pd.DataFrame(list(data), columns=self.column_names).infer_objects())
  61. self._type_dict = {}
  62. try:
  63. # The driver may not be passing a cursor.description
  64. self._type_dict = {
  65. col: db_engine_spec.get_datatype(cursor_description[i][1])
  66. for i, col in enumerate(self.column_names)
  67. if cursor_description
  68. }
  69. except Exception as e:
  70. logging.exception(e)
  71. @property
  72. def size(self):
  73. return len(self.df.index)
  74. @property
  75. def data(self):
  76. # work around for https://github.com/pandas-dev/pandas/issues/18372
  77. data = [dict((k, _maybe_box_datetimelike(v))
  78. for k, v in zip(self.df.columns, np.atleast_1d(row)))
  79. for row in self.df.values]
  80. for d in data:
  81. for k, v in list(d.items()):
  82. # if an int is too big for Java Script to handle
  83. # convert it to a string
  84. if isinstance(v, int):
  85. if abs(v) > JS_MAX_INTEGER:
  86. d[k] = str(v)
  87. return data
  88. @classmethod
  89. def db_type(cls, dtype):
  90. """Given a numpy dtype, Returns a generic database type"""
  91. if isinstance(dtype, ExtensionDtype):
  92. return cls.type_map.get(dtype.kind)
  93. elif hasattr(dtype, 'char'):
  94. return cls.type_map.get(dtype.char)
  95. @classmethod
  96. def datetime_conversion_rate(cls, data_series):
  97. success = 0
  98. total = 0
  99. for value in data_series:
  100. total += 1
  101. try:
  102. pd.to_datetime(value)
  103. success += 1
  104. except Exception:
  105. continue
  106. return 100 * success / total
  107. @staticmethod
  108. def is_date(np_dtype, db_type_str):
  109. def looks_daty(s):
  110. if isinstance(s, basestring):
  111. return any([s.lower().startswith(ss) for ss in ('time', 'date')])
  112. return False
  113. if looks_daty(db_type_str):
  114. return True
  115. if np_dtype and np_dtype.name and looks_daty(np_dtype.name):
  116. return True
  117. return False
  118. @classmethod
  119. def is_dimension(cls, dtype, column_name):
  120. if cls.is_id(column_name):
  121. return False
  122. return dtype.name in ('object', 'bool')
  123. @classmethod
  124. def is_id(cls, column_name):
  125. return column_name.startswith('id') or column_name.endswith('id')
  126. @classmethod
  127. def agg_func(cls, dtype, column_name):
  128. # consider checking for key substring too.
  129. if cls.is_id(column_name):
  130. return 'count_distinct'
  131. if (hasattr(dtype, 'type') and issubclass(dtype.type, np.generic) and
  132. np.issubdtype(dtype, np.number)):
  133. return 'sum'
  134. return None
  135. @property
  136. def columns(self):
  137. """Provides metadata about columns for data visualization.
  138. :return: dict, with the fields name, type, is_date, is_dim and agg.
  139. """
  140. if self.df.empty:
  141. return None
  142. columns = []
  143. sample_size = min(INFER_COL_TYPES_SAMPLE_SIZE, len(self.df.index))
  144. sample = self.df
  145. if sample_size:
  146. sample = self.df.sample(sample_size)
  147. for col in self.df.dtypes.keys():
  148. db_type_str = (
  149. self._type_dict.get(col) or
  150. self.db_type(self.df.dtypes[col])
  151. )
  152. column = {
  153. 'name': col,
  154. 'agg': self.agg_func(self.df.dtypes[col], col),
  155. 'type': db_type_str,
  156. 'is_date': self.is_date(self.df.dtypes[col], db_type_str),
  157. 'is_dim': self.is_dimension(self.df.dtypes[col], col),
  158. }
  159. if not db_type_str or db_type_str.upper() == 'OBJECT':
  160. v = sample[col].iloc[0] if not sample[col].empty else None
  161. if isinstance(v, basestring):
  162. column['type'] = 'STRING'
  163. elif isinstance(v, int):
  164. column['type'] = 'INT'
  165. elif isinstance(v, float):
  166. column['type'] = 'FLOAT'
  167. elif isinstance(v, (datetime, date)):
  168. column['type'] = 'DATETIME'
  169. column['is_date'] = True
  170. column['is_dim'] = False
  171. # check if encoded datetime
  172. if (
  173. column['type'] == 'STRING' and
  174. self.datetime_conversion_rate(sample[col]) >
  175. INFER_COL_TYPES_THRESHOLD):
  176. column.update({
  177. 'is_date': True,
  178. 'is_dim': False,
  179. 'agg': None,
  180. })
  181. # 'agg' is optional attribute
  182. if not column['agg']:
  183. column.pop('agg', None)
  184. columns.append(column)
  185. return columns