presto_tests.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438
  1. # Licensed to the Apache Software Foundation (ASF) under one
  2. # or more contributor license agreements. See the NOTICE file
  3. # distributed with this work for additional information
  4. # regarding copyright ownership. The ASF licenses this file
  5. # to you under the Apache License, Version 2.0 (the
  6. # "License"); you may not use this file except in compliance
  7. # with the License. You may obtain a copy of the License at
  8. #
  9. # http://www.apache.org/licenses/LICENSE-2.0
  10. #
  11. # Unless required by applicable law or agreed to in writing,
  12. # software distributed under the License is distributed on an
  13. # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  14. # KIND, either express or implied. See the License for the
  15. # specific language governing permissions and limitations
  16. # under the License.
  17. from unittest import mock, skipUnless
  18. import pandas as pd
  19. from sqlalchemy.engine.result import RowProxy
  20. from sqlalchemy.sql import select
  21. from superset.db_engine_specs.presto import PrestoEngineSpec
  22. from tests.db_engine_specs.base_tests import DbEngineSpecTestCase
  class PrestoTests(DbEngineSpecTestCase):
      """Unit tests for ``PrestoEngineSpec``.

      Covers datatype lookup, column introspection (including nested ROW/ARRAY
      types), field quoting, ``expand_data``, latest-partition helpers,
      datetime literal conversion and query cost formatting.
      """

      @skipUnless(
          DbEngineSpecTestCase.is_module_installed("pyhive"), "pyhive not installed"
      )
      def test_get_datatype_presto(self):
          """``get_datatype("string")`` is expected to yield ``"STRING"``."""
          self.assertEqual("STRING", PrestoEngineSpec.get_datatype("string"))

      def test_presto_get_view_names_return_empty_list(
          self
      ):  # pylint: disable=invalid-name
          """``get_view_names`` is expected to return an empty list for any arguments."""
          self.assertEqual(
              [], PrestoEngineSpec.get_view_names(mock.ANY, mock.ANY, mock.ANY)
          )

      def verify_presto_column(self, column, expected_results):
          """Run ``get_columns`` against a single mocked row and check the result.

          :param column: 3-tuple used as the row data; the keymap below exposes its
              positions as ``Column``, ``Type`` and ``Null``
          :param expected_results: list of ``(name, type_string)`` pairs, compared
              in order against ``result["name"]`` and ``str(result["type"])``
          """
          inspector = mock.Mock()
          inspector.engine.dialect.identifier_preparer.quote_identifier = mock.Mock()
          # Maps each result-set key to the index of its value inside ``column``.
          keymap = {
              "Column": (None, None, 0),
              "Type": (None, None, 1),
              "Null": (None, None, 2),
          }
          # NOTE(review): positional args are presumably
          # (parent, row, processors, keymap) per SQLAlchemy's RowProxy -- confirm.
          row = RowProxy(mock.Mock(), column, [None, None, None, None], keymap)
          # The inspector's query yields exactly this one row.
          inspector.bind.execute = mock.Mock(return_value=[row])
          results = PrestoEngineSpec.get_columns(inspector, "", "")
          # Check the count first so the zip() below cannot hide missing columns.
          self.assertEqual(len(expected_results), len(results))
          for expected_result, result in zip(expected_results, results):
              self.assertEqual(expected_result[0], result["name"])
              self.assertEqual(expected_result[1], str(result["type"]))

      def test_presto_get_column(self):
          """A plain ``boolean`` column is reported as ``BOOLEAN``."""
          presto_column = ("column_name", "boolean", "")
          expected_results = [("column_name", "BOOLEAN")]
          self.verify_presto_column(presto_column, expected_results)

      @mock.patch.dict(
          "superset.extensions.feature_flag_manager._feature_flags",
          {"PRESTO_EXPAND_DATA": True},
          clear=True,
      )
      def test_presto_get_simple_row_column(self):
          """A ROW column yields the parent plus one dotted entry per nested field."""
          presto_column = ("column_name", "row(nested_obj double)", "")
          expected_results = [("column_name", "ROW"), ("column_name.nested_obj", "FLOAT")]
          self.verify_presto_column(presto_column, expected_results)

      @mock.patch.dict(
          "superset.extensions.feature_flag_manager._feature_flags",
          {"PRESTO_EXPAND_DATA": True},
          clear=True,
      )
      def test_presto_get_simple_row_column_with_name_containing_whitespace(self):
          """Whitespace in the parent column name is kept in the expanded names."""
          presto_column = ("column name", "row(nested_obj double)", "")
          expected_results = [("column name", "ROW"), ("column name.nested_obj", "FLOAT")]
          self.verify_presto_column(presto_column, expected_results)

      @mock.patch.dict(
          "superset.extensions.feature_flag_manager._feature_flags",
          {"PRESTO_EXPAND_DATA": True},
          clear=True,
      )
      def test_presto_get_simple_row_column_with_tricky_nested_field_name(self):
          """A quoted nested field containing spaces, commas and parentheses is kept intact."""
          presto_column = ("column_name", 'row("Field Name(Tricky, Name)" double)', "")
          expected_results = [
              ("column_name", "ROW"),
              ('column_name."Field Name(Tricky, Name)"', "FLOAT"),
          ]
          self.verify_presto_column(presto_column, expected_results)

      @mock.patch.dict(
          "superset.extensions.feature_flag_manager._feature_flags",
          {"PRESTO_EXPAND_DATA": True},
          clear=True,
      )
      def test_presto_get_simple_array_column(self):
          """An array of scalars is reported as a single ``ARRAY`` column."""
          presto_column = ("column_name", "array(double)", "")
          expected_results = [("column_name", "ARRAY")]
          self.verify_presto_column(presto_column, expected_results)

      @mock.patch.dict(
          "superset.extensions.feature_flag_manager._feature_flags",
          {"PRESTO_EXPAND_DATA": True},
          clear=True,
      )
      def test_presto_get_row_within_array_within_row_column(self):
          """ROW -> ARRAY -> ROW nesting is flattened into dotted column names."""
          presto_column = (
              "column_name",
              "row(nested_array array(row(nested_row double)), nested_obj double)",
              "",
          )
          expected_results = [
              ("column_name", "ROW"),
              ("column_name.nested_array", "ARRAY"),
              ("column_name.nested_array.nested_row", "FLOAT"),
              ("column_name.nested_obj", "FLOAT"),
          ]
          self.verify_presto_column(presto_column, expected_results)

      @mock.patch.dict(
          "superset.extensions.feature_flag_manager._feature_flags",
          {"PRESTO_EXPAND_DATA": True},
          clear=True,
      )
      def test_presto_get_array_within_row_within_array_column(self):
          """ARRAY -> ROW -> ARRAY nesting is flattened into dotted column names."""
          presto_column = (
              "column_name",
              "array(row(nested_array array(double), nested_obj double))",
              "",
          )
          expected_results = [
              ("column_name", "ARRAY"),
              ("column_name.nested_array", "ARRAY"),
              ("column_name.nested_obj", "FLOAT"),
          ]
          self.verify_presto_column(presto_column, expected_results)

      def test_presto_get_fields(self):
          """``_get_fields`` quotes each dotted path segment and labels it with the raw name."""
          cols = [
              {"name": "column"},
              {"name": "column.nested_obj"},
              {"name": 'column."quoted.nested obj"'},
          ]
          actual_results = PrestoEngineSpec._get_fields(cols)
          expected_results = [
              {"name": '"column"', "label": "column"},
              {"name": '"column"."nested_obj"', "label": "column.nested_obj"},
              {
                  "name": '"column"."quoted.nested obj"',
                  "label": 'column."quoted.nested obj"',
              },
          ]
          # zip() stops at the shorter input, so without this check a short or
          # empty result would let the test pass without asserting anything.
          self.assertEqual(len(expected_results), len(actual_results))
          for actual_result, expected_result in zip(actual_results, expected_results):
              self.assertEqual(actual_result.element.name, expected_result["name"])
              self.assertEqual(actual_result.name, expected_result["label"])

      @mock.patch.dict(
          "superset.extensions.feature_flag_manager._feature_flags",
          {"PRESTO_EXPAND_DATA": True},
          clear=True,
      )
      def test_presto_expand_data_with_simple_structural_columns(self):
          """``expand_data`` on one ROW column and one ARRAY column of scalars.

          The expected output has one row per array element; the row column and
          its nested field are filled only on the first row of each source
          record and are empty strings on the others.
          """
          cols = [
              {"name": "row_column", "type": "ROW(NESTED_OBJ VARCHAR)"},
              {"name": "array_column", "type": "ARRAY(BIGINT)"},
          ]
          data = [
              {"row_column": ["a"], "array_column": [1, 2, 3]},
              {"row_column": ["b"], "array_column": [4, 5, 6]},
          ]
          actual_cols, actual_data, actual_expanded_cols = PrestoEngineSpec.expand_data(
              cols, data
          )
          expected_cols = [
              {"name": "row_column", "type": "ROW(NESTED_OBJ VARCHAR)"},
              {"name": "row_column.nested_obj", "type": "VARCHAR"},
              {"name": "array_column", "type": "ARRAY(BIGINT)"},
          ]
          expected_data = [
              {"array_column": 1, "row_column": ["a"], "row_column.nested_obj": "a"},
              {"array_column": 2, "row_column": "", "row_column.nested_obj": ""},
              {"array_column": 3, "row_column": "", "row_column.nested_obj": ""},
              {"array_column": 4, "row_column": ["b"], "row_column.nested_obj": "b"},
              {"array_column": 5, "row_column": "", "row_column.nested_obj": ""},
              {"array_column": 6, "row_column": "", "row_column.nested_obj": ""},
          ]
          expected_expanded_cols = [{"name": "row_column.nested_obj", "type": "VARCHAR"}]
          self.assertEqual(actual_cols, expected_cols)
          self.assertEqual(actual_data, expected_data)
          self.assertEqual(actual_expanded_cols, expected_expanded_cols)

      @mock.patch.dict(
          "superset.extensions.feature_flag_manager._feature_flags",
          {"PRESTO_EXPAND_DATA": True},
          clear=True,
      )
      def test_presto_expand_data_with_complex_row_columns(self):
          """``expand_data`` on a ROW that contains a scalar field and a nested ROW.

          Every nested level is expected to appear as its own dotted column, and
          the number of data rows is unchanged (no arrays are involved).
          """
          cols = [
              {
                  "name": "row_column",
                  "type": "ROW(NESTED_OBJ1 VARCHAR, NESTED_ROW ROW(NESTED_OBJ2 VARCHAR))",
              }
          ]
          data = [{"row_column": ["a1", ["a2"]]}, {"row_column": ["b1", ["b2"]]}]
          actual_cols, actual_data, actual_expanded_cols = PrestoEngineSpec.expand_data(
              cols, data
          )
          expected_cols = [
              {
                  "name": "row_column",
                  "type": "ROW(NESTED_OBJ1 VARCHAR, NESTED_ROW ROW(NESTED_OBJ2 VARCHAR))",
              },
              {"name": "row_column.nested_row", "type": "ROW(NESTED_OBJ2 VARCHAR)"},
              {"name": "row_column.nested_row.nested_obj2", "type": "VARCHAR"},
              {"name": "row_column.nested_obj1", "type": "VARCHAR"},
          ]
          expected_data = [
              {
                  "row_column": ["a1", ["a2"]],
                  "row_column.nested_obj1": "a1",
                  "row_column.nested_row": ["a2"],
                  "row_column.nested_row.nested_obj2": "a2",
              },
              {
                  "row_column": ["b1", ["b2"]],
                  "row_column.nested_obj1": "b1",
                  "row_column.nested_row": ["b2"],
                  "row_column.nested_row.nested_obj2": "b2",
              },
          ]
          expected_expanded_cols = [
              {"name": "row_column.nested_obj1", "type": "VARCHAR"},
              {"name": "row_column.nested_row", "type": "ROW(NESTED_OBJ2 VARCHAR)"},
              {"name": "row_column.nested_row.nested_obj2", "type": "VARCHAR"},
          ]
          self.assertEqual(actual_cols, expected_cols)
          self.assertEqual(actual_data, expected_data)
          self.assertEqual(actual_expanded_cols, expected_expanded_cols)

      @mock.patch.dict(
          "superset.extensions.feature_flag_manager._feature_flags",
          {"PRESTO_EXPAND_DATA": True},
          clear=True,
      )
      def test_presto_expand_data_with_complex_array_columns(self):
          """``expand_data`` on ARRAY(ROW(ARRAY(ROW(...)))) next to a scalar column.

          Each of the two source records is expected to unfold into four rows
          (two outer elements times two inner elements); the scalar column keeps
          its value only on the first row of each record.
          """
          cols = [
              {"name": "int_column", "type": "BIGINT"},
              {
                  "name": "array_column",
                  "type": "ARRAY(ROW(NESTED_ARRAY ARRAY(ROW(NESTED_OBJ VARCHAR))))",
              },
          ]
          data = [
              {"int_column": 1, "array_column": [[[["a"], ["b"]]], [[["c"], ["d"]]]]},
              {"int_column": 2, "array_column": [[[["e"], ["f"]]], [[["g"], ["h"]]]]},
          ]
          actual_cols, actual_data, actual_expanded_cols = PrestoEngineSpec.expand_data(
              cols, data
          )
          expected_cols = [
              {"name": "int_column", "type": "BIGINT"},
              {
                  "name": "array_column",
                  "type": "ARRAY(ROW(NESTED_ARRAY ARRAY(ROW(NESTED_OBJ VARCHAR))))",
              },
              {
                  "name": "array_column.nested_array",
                  "type": "ARRAY(ROW(NESTED_OBJ VARCHAR))",
              },
              {"name": "array_column.nested_array.nested_obj", "type": "VARCHAR"},
          ]
          expected_data = [
              {
                  "array_column": [[["a"], ["b"]]],
                  "array_column.nested_array": ["a"],
                  "array_column.nested_array.nested_obj": "a",
                  "int_column": 1,
              },
              {
                  "array_column": "",
                  "array_column.nested_array": ["b"],
                  "array_column.nested_array.nested_obj": "b",
                  "int_column": "",
              },
              {
                  "array_column": [[["c"], ["d"]]],
                  "array_column.nested_array": ["c"],
                  "array_column.nested_array.nested_obj": "c",
                  "int_column": "",
              },
              {
                  "array_column": "",
                  "array_column.nested_array": ["d"],
                  "array_column.nested_array.nested_obj": "d",
                  "int_column": "",
              },
              {
                  "array_column": [[["e"], ["f"]]],
                  "array_column.nested_array": ["e"],
                  "array_column.nested_array.nested_obj": "e",
                  "int_column": 2,
              },
              {
                  "array_column": "",
                  "array_column.nested_array": ["f"],
                  "array_column.nested_array.nested_obj": "f",
                  "int_column": "",
              },
              {
                  "array_column": [[["g"], ["h"]]],
                  "array_column.nested_array": ["g"],
                  "array_column.nested_array.nested_obj": "g",
                  "int_column": "",
              },
              {
                  "array_column": "",
                  "array_column.nested_array": ["h"],
                  "array_column.nested_array.nested_obj": "h",
                  "int_column": "",
              },
          ]
          expected_expanded_cols = [
              {
                  "name": "array_column.nested_array",
                  "type": "ARRAY(ROW(NESTED_OBJ VARCHAR))",
              },
              {"name": "array_column.nested_array.nested_obj", "type": "VARCHAR"},
          ]
          self.assertEqual(actual_cols, expected_cols)
          self.assertEqual(actual_data, expected_data)
          self.assertEqual(actual_expanded_cols, expected_expanded_cols)

      def test_presto_extra_table_metadata(self):
          """``extra_table_metadata`` reports the latest partition from the mocked frame."""
          db = mock.Mock()
          db.get_indexes = mock.Mock(return_value=[{"column_names": ["ds", "hour"]}])
          db.get_extra = mock.Mock(return_value={})
          df = pd.DataFrame({"ds": ["01-01-19"], "hour": [1]})
          db.get_df = mock.Mock(return_value=df)
          # Stub get_create_view only for the duration of the call. Assigning the
          # mock directly on the class would never be undone and would leak into
          # every test that runs afterwards.
          with mock.patch.object(
              PrestoEngineSpec, "get_create_view", return_value=None
          ):
              result = PrestoEngineSpec.extra_table_metadata(
                  db, "test_table", "test_schema"
              )
          self.assertEqual({"ds": "01-01-19", "hour": 1}, result["partitions"]["latest"])

      def test_presto_where_latest_partition(self):
          """``where_latest_partition`` adds one equality filter per partition column."""
          db = mock.Mock()
          db.get_indexes = mock.Mock(return_value=[{"column_names": ["ds", "hour"]}])
          db.get_extra = mock.Mock(return_value={})
          df = pd.DataFrame({"ds": ["01-01-19"], "hour": [1]})
          db.get_df = mock.Mock(return_value=df)
          columns = [{"name": "ds"}, {"name": "hour"}]
          result = PrestoEngineSpec.where_latest_partition(
              "test_table", "test_schema", db, select(), columns
          )
          # literal_binds inlines the parameter values so the SQL text can be
          # compared as a plain string.
          query_result = str(result.compile(compile_kwargs={"literal_binds": True}))
          self.assertEqual("SELECT \nWHERE ds = '01-01-19' AND hour = 1", query_result)

      def test_convert_dttm(self):
          """``convert_dttm`` renders DATE and TIMESTAMP values as ``from_iso8601_*`` calls."""
          dttm = self.get_dttm()
          self.assertEqual(
              PrestoEngineSpec.convert_dttm("DATE", dttm),
              "from_iso8601_date('2019-01-02')",
          )
          self.assertEqual(
              PrestoEngineSpec.convert_dttm("TIMESTAMP", dttm),
              "from_iso8601_timestamp('2019-01-02T03:04:05.678900')",
          )

      def test_query_cost_formatter(self):
          """``query_cost_formatter`` turns the top-level estimate into labelled strings.

          Only the outer ``estimate`` can account for the expected output: its
          ``networkCost`` is non-zero ("354 G") while the per-table estimate's
          ``networkCost`` is 0.0.
          """
          raw_cost = [
              {
                  "inputTableColumnInfos": [
                      {
                          "table": {
                              "catalog": "hive",
                              "schemaTable": {
                                  "schema": "default",
                                  "table": "fact_passenger_state",
                              },
                          },
                          "columnConstraints": [
                              {
                                  "columnName": "ds",
                                  "typeSignature": "varchar",
                                  "domain": {
                                      "nullsAllowed": False,
                                      "ranges": [
                                          {
                                              "low": {
                                                  "value": "2019-07-10",
                                                  "bound": "EXACTLY",
                                              },
                                              "high": {
                                                  "value": "2019-07-10",
                                                  "bound": "EXACTLY",
                                              },
                                          }
                                      ],
                                  },
                              }
                          ],
                          "estimate": {
                              "outputRowCount": 9.04969899e8,
                              "outputSizeInBytes": 3.54143678301e11,
                              "cpuCost": 3.54143678301e11,
                              "maxMemory": 0.0,
                              "networkCost": 0.0,
                          },
                      }
                  ],
                  "estimate": {
                      "outputRowCount": 9.04969899e8,
                      "outputSizeInBytes": 3.54143678301e11,
                      "cpuCost": 3.54143678301e11,
                      "maxMemory": 0.0,
                      "networkCost": 3.54143678301e11,
                  },
              }
          ]
          formatted_cost = PrestoEngineSpec.query_cost_formatter(raw_cost)
          expected = [
              {
                  "Output count": "904 M rows",
                  "Output size": "354 GB",
                  "CPU cost": "354 G",
                  "Max memory": "0 B",
                  "Network cost": "354 G",
              }
          ]
          self.assertEqual(formatted_cost, expected)
  411. self.assertEqual(formatted_cost, expected)