# birth_names.py
  1. # Licensed to the Apache Software Foundation (ASF) under one
  2. # or more contributor license agreements. See the NOTICE file
  3. # distributed with this work for additional information
  4. # regarding copyright ownership. The ASF licenses this file
  5. # to you under the Apache License, Version 2.0 (the
  6. # "License"); you may not use this file except in compliance
  7. # with the License. You may obtain a copy of the License at
  8. #
  9. # http://www.apache.org/licenses/LICENSE-2.0
  10. #
  11. # Unless required by applicable law or agreed to in writing,
  12. # software distributed under the License is distributed on an
  13. # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  14. # KIND, either express or implied. See the License for the
  15. # specific language governing permissions and limitations
  16. # under the License.
  17. import json
  18. import textwrap
  19. import pandas as pd
  20. from sqlalchemy import DateTime, String
  21. from sqlalchemy.sql import column
  22. from superset import db, security_manager
  23. from superset.connectors.sqla.models import SqlMetric, TableColumn
  24. from superset.models.dashboard import Dashboard
  25. from superset.models.slice import Slice
  26. from superset.utils.core import get_example_database
  27. from .helpers import (
  28. config,
  29. get_example_data,
  30. get_slice_json,
  31. merge_slice,
  32. misc_dash_slices,
  33. TBL,
  34. update_slice_ids,
  35. )
  36. def gen_filter(subject, comparator, operator="=="):
  37. return {
  38. "clause": "WHERE",
  39. "comparator": comparator,
  40. "expressionType": "SIMPLE",
  41. "operator": operator,
  42. "subject": subject,
  43. "fromFormData": True,
  44. }
  45. def load_data(tbl_name, database):
  46. pdf = pd.read_json(get_example_data("birth_names.json.gz"))
  47. pdf.ds = pd.to_datetime(pdf.ds, unit="ms")
  48. pdf.to_sql(
  49. tbl_name,
  50. database.get_sqla_engine(),
  51. if_exists="replace",
  52. chunksize=500,
  53. dtype={
  54. "ds": DateTime,
  55. "gender": String(16),
  56. "state": String(10),
  57. "name": String(255),
  58. },
  59. index=False,
  60. )
  61. print("Done loading table!")
  62. print("-" * 80)
  63. def load_birth_names(only_metadata=False, force=False):
  64. """Loading birth name dataset from a zip file in the repo"""
  65. # pylint: disable=too-many-locals
  66. tbl_name = "birth_names"
  67. database = get_example_database()
  68. table_exists = database.has_table_by_name(tbl_name)
  69. if not only_metadata and (not table_exists or force):
  70. load_data(tbl_name, database)
  71. obj = db.session.query(TBL).filter_by(table_name=tbl_name).first()
  72. if not obj:
  73. print(f"Creating table [{tbl_name}] reference")
  74. obj = TBL(table_name=tbl_name)
  75. db.session.add(obj)
  76. obj.main_dttm_col = "ds"
  77. obj.database = database
  78. obj.filter_select_enabled = True
  79. if not any(col.column_name == "num_california" for col in obj.columns):
  80. col_state = str(column("state").compile(db.engine))
  81. col_num = str(column("num").compile(db.engine))
  82. obj.columns.append(
  83. TableColumn(
  84. column_name="num_california",
  85. expression=f"CASE WHEN {col_state} = 'CA' THEN {col_num} ELSE 0 END",
  86. )
  87. )
  88. if not any(col.metric_name == "sum__num" for col in obj.metrics):
  89. col = str(column("num").compile(db.engine))
  90. obj.metrics.append(SqlMetric(metric_name="sum__num", expression=f"SUM({col})"))
  91. db.session.commit()
  92. obj.fetch_metadata()
  93. tbl = obj
  94. defaults = {
  95. "compare_lag": "10",
  96. "compare_suffix": "o10Y",
  97. "limit": "25",
  98. "granularity_sqla": "ds",
  99. "groupby": [],
  100. "metric": "sum__num",
  101. "metrics": [
  102. {
  103. "expressionType": "SIMPLE",
  104. "column": {"column_name": "num", "type": "BIGINT"},
  105. "aggregate": "SUM",
  106. "label": "Births",
  107. "optionName": "metric_11",
  108. }
  109. ],
  110. "row_limit": config["ROW_LIMIT"],
  111. "since": "100 years ago",
  112. "until": "now",
  113. "viz_type": "table",
  114. "markup_type": "markdown",
  115. }
  116. admin = security_manager.find_user("admin")
  117. print("Creating some slices")
  118. slices = [
  119. Slice(
  120. slice_name="Participants",
  121. viz_type="big_number",
  122. datasource_type="table",
  123. datasource_id=tbl.id,
  124. params=get_slice_json(
  125. defaults,
  126. viz_type="big_number",
  127. granularity_sqla="ds",
  128. compare_lag="5",
  129. compare_suffix="over 5Y",
  130. ),
  131. ),
  132. Slice(
  133. slice_name="Genders",
  134. viz_type="pie",
  135. datasource_type="table",
  136. datasource_id=tbl.id,
  137. params=get_slice_json(defaults, viz_type="pie", groupby=["gender"]),
  138. ),
  139. Slice(
  140. slice_name="Trends",
  141. viz_type="line",
  142. datasource_type="table",
  143. datasource_id=tbl.id,
  144. params=get_slice_json(
  145. defaults,
  146. viz_type="line",
  147. groupby=["name"],
  148. granularity_sqla="ds",
  149. rich_tooltip=True,
  150. show_legend=True,
  151. ),
  152. ),
  153. Slice(
  154. slice_name="Genders by State",
  155. viz_type="dist_bar",
  156. datasource_type="table",
  157. datasource_id=tbl.id,
  158. params=get_slice_json(
  159. defaults,
  160. adhoc_filters=[
  161. {
  162. "clause": "WHERE",
  163. "expressionType": "SIMPLE",
  164. "filterOptionName": "2745eae5",
  165. "comparator": ["other"],
  166. "operator": "not in",
  167. "subject": "state",
  168. }
  169. ],
  170. viz_type="dist_bar",
  171. metrics=[
  172. {
  173. "expressionType": "SIMPLE",
  174. "column": {"column_name": "sum_boys", "type": "BIGINT(20)"},
  175. "aggregate": "SUM",
  176. "label": "Boys",
  177. "optionName": "metric_11",
  178. },
  179. {
  180. "expressionType": "SIMPLE",
  181. "column": {"column_name": "sum_girls", "type": "BIGINT(20)"},
  182. "aggregate": "SUM",
  183. "label": "Girls",
  184. "optionName": "metric_12",
  185. },
  186. ],
  187. groupby=["state"],
  188. ),
  189. ),
  190. Slice(
  191. slice_name="Girls",
  192. viz_type="table",
  193. datasource_type="table",
  194. datasource_id=tbl.id,
  195. params=get_slice_json(
  196. defaults,
  197. groupby=["name"],
  198. adhoc_filters=[gen_filter("gender", "girl")],
  199. row_limit=50,
  200. timeseries_limit_metric="sum__num",
  201. ),
  202. ),
  203. Slice(
  204. slice_name="Girl Name Cloud",
  205. viz_type="word_cloud",
  206. datasource_type="table",
  207. datasource_id=tbl.id,
  208. params=get_slice_json(
  209. defaults,
  210. viz_type="word_cloud",
  211. size_from="10",
  212. series="name",
  213. size_to="70",
  214. rotation="square",
  215. limit="100",
  216. adhoc_filters=[gen_filter("gender", "girl")],
  217. ),
  218. ),
  219. Slice(
  220. slice_name="Boys",
  221. viz_type="table",
  222. datasource_type="table",
  223. datasource_id=tbl.id,
  224. params=get_slice_json(
  225. defaults,
  226. groupby=["name"],
  227. adhoc_filters=[gen_filter("gender", "boy")],
  228. row_limit=50,
  229. ),
  230. ),
  231. Slice(
  232. slice_name="Boy Name Cloud",
  233. viz_type="word_cloud",
  234. datasource_type="table",
  235. datasource_id=tbl.id,
  236. params=get_slice_json(
  237. defaults,
  238. viz_type="word_cloud",
  239. size_from="10",
  240. series="name",
  241. size_to="70",
  242. rotation="square",
  243. limit="100",
  244. adhoc_filters=[gen_filter("gender", "boy")],
  245. ),
  246. ),
  247. Slice(
  248. slice_name="Top 10 Girl Name Share",
  249. viz_type="area",
  250. datasource_type="table",
  251. datasource_id=tbl.id,
  252. params=get_slice_json(
  253. defaults,
  254. adhoc_filters=[gen_filter("gender", "girl")],
  255. comparison_type="values",
  256. groupby=["name"],
  257. limit=10,
  258. stacked_style="expand",
  259. time_grain_sqla="P1D",
  260. viz_type="area",
  261. x_axis_forma="smart_date",
  262. ),
  263. ),
  264. Slice(
  265. slice_name="Top 10 Boy Name Share",
  266. viz_type="area",
  267. datasource_type="table",
  268. datasource_id=tbl.id,
  269. params=get_slice_json(
  270. defaults,
  271. adhoc_filters=[gen_filter("gender", "boy")],
  272. comparison_type="values",
  273. groupby=["name"],
  274. limit=10,
  275. stacked_style="expand",
  276. time_grain_sqla="P1D",
  277. viz_type="area",
  278. x_axis_forma="smart_date",
  279. ),
  280. ),
  281. ]
  282. misc_slices = [
  283. Slice(
  284. slice_name="Average and Sum Trends",
  285. viz_type="dual_line",
  286. datasource_type="table",
  287. datasource_id=tbl.id,
  288. params=get_slice_json(
  289. defaults,
  290. viz_type="dual_line",
  291. metric={
  292. "expressionType": "SIMPLE",
  293. "column": {"column_name": "num", "type": "BIGINT(20)"},
  294. "aggregate": "AVG",
  295. "label": "AVG(num)",
  296. "optionName": "metric_vgops097wej_g8uff99zhk7",
  297. },
  298. metric_2="sum__num",
  299. granularity_sqla="ds",
  300. ),
  301. ),
  302. Slice(
  303. slice_name="Num Births Trend",
  304. viz_type="line",
  305. datasource_type="table",
  306. datasource_id=tbl.id,
  307. params=get_slice_json(defaults, viz_type="line"),
  308. ),
  309. Slice(
  310. slice_name="Daily Totals",
  311. viz_type="table",
  312. datasource_type="table",
  313. datasource_id=tbl.id,
  314. created_by=admin,
  315. params=get_slice_json(
  316. defaults,
  317. groupby=["ds"],
  318. since="40 years ago",
  319. until="now",
  320. viz_type="table",
  321. ),
  322. ),
  323. Slice(
  324. slice_name="Number of California Births",
  325. viz_type="big_number_total",
  326. datasource_type="table",
  327. datasource_id=tbl.id,
  328. params=get_slice_json(
  329. defaults,
  330. metric={
  331. "expressionType": "SIMPLE",
  332. "column": {
  333. "column_name": "num_california",
  334. "expression": "CASE WHEN state = 'CA' THEN num ELSE 0 END",
  335. },
  336. "aggregate": "SUM",
  337. "label": "SUM(num_california)",
  338. },
  339. viz_type="big_number_total",
  340. granularity_sqla="ds",
  341. ),
  342. ),
  343. Slice(
  344. slice_name="Top 10 California Names Timeseries",
  345. viz_type="line",
  346. datasource_type="table",
  347. datasource_id=tbl.id,
  348. params=get_slice_json(
  349. defaults,
  350. metrics=[
  351. {
  352. "expressionType": "SIMPLE",
  353. "column": {
  354. "column_name": "num_california",
  355. "expression": "CASE WHEN state = 'CA' THEN num ELSE 0 END",
  356. },
  357. "aggregate": "SUM",
  358. "label": "SUM(num_california)",
  359. }
  360. ],
  361. viz_type="line",
  362. granularity_sqla="ds",
  363. groupby=["name"],
  364. timeseries_limit_metric={
  365. "expressionType": "SIMPLE",
  366. "column": {
  367. "column_name": "num_california",
  368. "expression": "CASE WHEN state = 'CA' THEN num ELSE 0 END",
  369. },
  370. "aggregate": "SUM",
  371. "label": "SUM(num_california)",
  372. },
  373. limit="10",
  374. ),
  375. ),
  376. Slice(
  377. slice_name="Names Sorted by Num in California",
  378. viz_type="table",
  379. datasource_type="table",
  380. datasource_id=tbl.id,
  381. params=get_slice_json(
  382. defaults,
  383. groupby=["name"],
  384. row_limit=50,
  385. timeseries_limit_metric={
  386. "expressionType": "SIMPLE",
  387. "column": {
  388. "column_name": "num_california",
  389. "expression": "CASE WHEN state = 'CA' THEN num ELSE 0 END",
  390. },
  391. "aggregate": "SUM",
  392. "label": "SUM(num_california)",
  393. },
  394. ),
  395. ),
  396. Slice(
  397. slice_name="Number of Girls",
  398. viz_type="big_number_total",
  399. datasource_type="table",
  400. datasource_id=tbl.id,
  401. params=get_slice_json(
  402. defaults,
  403. viz_type="big_number_total",
  404. granularity_sqla="ds",
  405. adhoc_filters=[gen_filter("gender", "girl")],
  406. subheader="total female participants",
  407. ),
  408. ),
  409. Slice(
  410. slice_name="Pivot Table",
  411. viz_type="pivot_table",
  412. datasource_type="table",
  413. datasource_id=tbl.id,
  414. params=get_slice_json(
  415. defaults, viz_type="pivot_table", groupby=["name"], columns=["state"]
  416. ),
  417. ),
  418. ]
  419. for slc in slices:
  420. merge_slice(slc)
  421. for slc in misc_slices:
  422. merge_slice(slc)
  423. misc_dash_slices.add(slc.slice_name)
  424. print("Creating a dashboard")
  425. dash = db.session.query(Dashboard).filter_by(slug="births").first()
  426. if not dash:
  427. dash = Dashboard()
  428. db.session.add(dash)
  429. dash.published = True
  430. dash.json_metadata = textwrap.dedent(
  431. """\
  432. {
  433. "label_colors": {
  434. "Girls": "#FF69B4",
  435. "Boys": "#ADD8E6",
  436. "girl": "#FF69B4",
  437. "boy": "#ADD8E6"
  438. }
  439. }"""
  440. )
  441. js = textwrap.dedent(
  442. # pylint: disable=line-too-long
  443. """\
  444. {
  445. "CHART-6GdlekVise": {
  446. "children": [],
  447. "id": "CHART-6GdlekVise",
  448. "meta": {
  449. "chartId": 5547,
  450. "height": 50,
  451. "sliceName": "Top 10 Girl Name Share",
  452. "width": 5
  453. },
  454. "parents": [
  455. "ROOT_ID",
  456. "GRID_ID",
  457. "ROW-eh0w37bWbR"
  458. ],
  459. "type": "CHART"
  460. },
  461. "CHART-6n9jxb30JG": {
  462. "children": [],
  463. "id": "CHART-6n9jxb30JG",
  464. "meta": {
  465. "chartId": 5540,
  466. "height": 36,
  467. "sliceName": "Genders by State",
  468. "width": 5
  469. },
  470. "parents": [
  471. "ROOT_ID",
  472. "GRID_ID",
  473. "ROW--EyBZQlDi"
  474. ],
  475. "type": "CHART"
  476. },
  477. "CHART-Jj9qh1ol-N": {
  478. "children": [],
  479. "id": "CHART-Jj9qh1ol-N",
  480. "meta": {
  481. "chartId": 5545,
  482. "height": 50,
  483. "sliceName": "Boy Name Cloud",
  484. "width": 4
  485. },
  486. "parents": [
  487. "ROOT_ID",
  488. "GRID_ID",
  489. "ROW-kzWtcvo8R1"
  490. ],
  491. "type": "CHART"
  492. },
  493. "CHART-ODvantb_bF": {
  494. "children": [],
  495. "id": "CHART-ODvantb_bF",
  496. "meta": {
  497. "chartId": 5548,
  498. "height": 50,
  499. "sliceName": "Top 10 Boy Name Share",
  500. "width": 5
  501. },
  502. "parents": [
  503. "ROOT_ID",
  504. "GRID_ID",
  505. "ROW-kzWtcvo8R1"
  506. ],
  507. "type": "CHART"
  508. },
  509. "CHART-PAXUUqwmX9": {
  510. "children": [],
  511. "id": "CHART-PAXUUqwmX9",
  512. "meta": {
  513. "chartId": 5538,
  514. "height": 34,
  515. "sliceName": "Genders",
  516. "width": 3
  517. },
  518. "parents": [
  519. "ROOT_ID",
  520. "GRID_ID",
  521. "ROW-2n0XgiHDgs"
  522. ],
  523. "type": "CHART"
  524. },
  525. "CHART-_T6n_K9iQN": {
  526. "children": [],
  527. "id": "CHART-_T6n_K9iQN",
  528. "meta": {
  529. "chartId": 5539,
  530. "height": 36,
  531. "sliceName": "Trends",
  532. "width": 7
  533. },
  534. "parents": [
  535. "ROOT_ID",
  536. "GRID_ID",
  537. "ROW--EyBZQlDi"
  538. ],
  539. "type": "CHART"
  540. },
  541. "CHART-eNY0tcE_ic": {
  542. "children": [],
  543. "id": "CHART-eNY0tcE_ic",
  544. "meta": {
  545. "chartId": 5537,
  546. "height": 34,
  547. "sliceName": "Participants",
  548. "width": 3
  549. },
  550. "parents": [
  551. "ROOT_ID",
  552. "GRID_ID",
  553. "ROW-2n0XgiHDgs"
  554. ],
  555. "type": "CHART"
  556. },
  557. "CHART-g075mMgyYb": {
  558. "children": [],
  559. "id": "CHART-g075mMgyYb",
  560. "meta": {
  561. "chartId": 5541,
  562. "height": 50,
  563. "sliceName": "Girls",
  564. "width": 3
  565. },
  566. "parents": [
  567. "ROOT_ID",
  568. "GRID_ID",
  569. "ROW-eh0w37bWbR"
  570. ],
  571. "type": "CHART"
  572. },
  573. "CHART-n-zGGE6S1y": {
  574. "children": [],
  575. "id": "CHART-n-zGGE6S1y",
  576. "meta": {
  577. "chartId": 5542,
  578. "height": 50,
  579. "sliceName": "Girl Name Cloud",
  580. "width": 4
  581. },
  582. "parents": [
  583. "ROOT_ID",
  584. "GRID_ID",
  585. "ROW-eh0w37bWbR"
  586. ],
  587. "type": "CHART"
  588. },
  589. "CHART-vJIPjmcbD3": {
  590. "children": [],
  591. "id": "CHART-vJIPjmcbD3",
  592. "meta": {
  593. "chartId": 5543,
  594. "height": 50,
  595. "sliceName": "Boys",
  596. "width": 3
  597. },
  598. "parents": [
  599. "ROOT_ID",
  600. "GRID_ID",
  601. "ROW-kzWtcvo8R1"
  602. ],
  603. "type": "CHART"
  604. },
  605. "DASHBOARD_VERSION_KEY": "v2",
  606. "GRID_ID": {
  607. "children": [
  608. "ROW-2n0XgiHDgs",
  609. "ROW--EyBZQlDi",
  610. "ROW-eh0w37bWbR",
  611. "ROW-kzWtcvo8R1"
  612. ],
  613. "id": "GRID_ID",
  614. "parents": [
  615. "ROOT_ID"
  616. ],
  617. "type": "GRID"
  618. },
  619. "HEADER_ID": {
  620. "id": "HEADER_ID",
  621. "meta": {
  622. "text": "Births"
  623. },
  624. "type": "HEADER"
  625. },
  626. "MARKDOWN-zaflB60tbC": {
  627. "children": [],
  628. "id": "MARKDOWN-zaflB60tbC",
  629. "meta": {
  630. "code": "<div style=\\"text-align:center\\"> <h1>Birth Names Dashboard</h1> <img src=\\"/static/assets/images/babies.png\\" style=\\"width:50%;\\"></div>",
  631. "height": 34,
  632. "width": 6
  633. },
  634. "parents": [
  635. "ROOT_ID",
  636. "GRID_ID",
  637. "ROW-2n0XgiHDgs"
  638. ],
  639. "type": "MARKDOWN"
  640. },
  641. "ROOT_ID": {
  642. "children": [
  643. "GRID_ID"
  644. ],
  645. "id": "ROOT_ID",
  646. "type": "ROOT"
  647. },
  648. "ROW--EyBZQlDi": {
  649. "children": [
  650. "CHART-_T6n_K9iQN",
  651. "CHART-6n9jxb30JG"
  652. ],
  653. "id": "ROW--EyBZQlDi",
  654. "meta": {
  655. "background": "BACKGROUND_TRANSPARENT"
  656. },
  657. "parents": [
  658. "ROOT_ID",
  659. "GRID_ID"
  660. ],
  661. "type": "ROW"
  662. },
  663. "ROW-2n0XgiHDgs": {
  664. "children": [
  665. "CHART-eNY0tcE_ic",
  666. "MARKDOWN-zaflB60tbC",
  667. "CHART-PAXUUqwmX9"
  668. ],
  669. "id": "ROW-2n0XgiHDgs",
  670. "meta": {
  671. "background": "BACKGROUND_TRANSPARENT"
  672. },
  673. "parents": [
  674. "ROOT_ID",
  675. "GRID_ID"
  676. ],
  677. "type": "ROW"
  678. },
  679. "ROW-eh0w37bWbR": {
  680. "children": [
  681. "CHART-g075mMgyYb",
  682. "CHART-n-zGGE6S1y",
  683. "CHART-6GdlekVise"
  684. ],
  685. "id": "ROW-eh0w37bWbR",
  686. "meta": {
  687. "background": "BACKGROUND_TRANSPARENT"
  688. },
  689. "parents": [
  690. "ROOT_ID",
  691. "GRID_ID"
  692. ],
  693. "type": "ROW"
  694. },
  695. "ROW-kzWtcvo8R1": {
  696. "children": [
  697. "CHART-vJIPjmcbD3",
  698. "CHART-Jj9qh1ol-N",
  699. "CHART-ODvantb_bF"
  700. ],
  701. "id": "ROW-kzWtcvo8R1",
  702. "meta": {
  703. "background": "BACKGROUND_TRANSPARENT"
  704. },
  705. "parents": [
  706. "ROOT_ID",
  707. "GRID_ID"
  708. ],
  709. "type": "ROW"
  710. }
  711. }
  712. """ # pylint: enable=line-too-long
  713. )
  714. pos = json.loads(js)
  715. # dashboard v2 doesn't allow add markup slice
  716. dash.slices = [slc for slc in slices if slc.viz_type != "markup"]
  717. update_slice_ids(pos, dash.slices)
  718. dash.dashboard_title = "USA Births Names"
  719. dash.position_json = json.dumps(pos, indent=4)
  720. dash.slug = "births"
  721. db.session.commit()