Skip to content

Commit

Permalink
big bang
Browse files Browse the repository at this point in the history
  • Loading branch information
vitor authored and vitor committed Feb 20, 2024
1 parent 44b1a51 commit 59ae2bf
Show file tree
Hide file tree
Showing 4 changed files with 339 additions and 0 deletions.
11 changes: 11 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Image for the Streamlit dashboard in app/streamlit_app.py.
FROM python:3.12.2-slim-bullseye

WORKDIR /app

# Install dependencies before copying the sources so this layer is reused
# from the build cache whenever only application code changes.
COPY app/requirements.txt /app/requirements.txt

# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip3 install --no-cache-dir -r requirements.txt

COPY app /app

# Port passed to Streamlit via --server.port below.
EXPOSE 8501

# Bind to 0.0.0.0 so the server is reachable from outside the container.
ENTRYPOINT ["streamlit", "run", "streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
5 changes: 5 additions & 0 deletions app/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Runtime dependencies of app/streamlit_app.py (installed by the Dockerfile).
# Data loading and aggregation.
pandas==2.2.0
# Dashboard framework / web server.
streamlit==1.31.0
# Charts (plotly.express).
plotly==5.19.0
# Database access layer used to read the validation table.
sqlalchemy==2.0.27
# PostgreSQL driver behind the "postgresql+psycopg2" SQLAlchemy URL.
psycopg2-binary==2.9.9
301 changes: 301 additions & 0 deletions app/streamlit_app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,301 @@
import os
import pandas as pd
import streamlit as st
import plotly.express as px
from sqlalchemy import create_engine
from sqlalchemy.engine import URL


# Page-level settings: browser tab title/icon and a full-width layout.
page_settings = {
    "page_title": "GêXis Gaúcho",
    "page_icon": "🍔",
    "layout": "wide",
}
st.set_page_config(**page_settings)


# A TTL makes the dashboard pick up new validation runs without a restart;
# without it the first query result would be served for the process lifetime.
@st.cache_data(ttl=900)
def load_data():
    """Load the normalized validation results and add derived columns.

    Connection settings come from the environment variables ``LOGIN``,
    ``PASSWORD``, ``HOST``, ``PORT`` and ``DATABASE``. The whole
    ``great_expectations.ge_validations_store_normalized`` table is read.

    Returns:
        pandas.DataFrame: the table contents, with ``meta.validation_time``
        parsed to datetimes, plus three derived columns:

        * ``meta.validation_date`` - validation time truncated to midnight,
        * ``meta.validation_yearmonth`` - first day of the validation month,
        * ``schema_table_name`` - ``"<schema_name>.<table_name>"``.

    Raises:
        KeyError: if one of the required environment variables is not set.
    """
    url_object = URL.create(
        "postgresql+psycopg2",
        username=os.environ["LOGIN"],
        password=os.environ["PASSWORD"],
        host=os.environ["HOST"],
        port=os.environ["PORT"],
        database=os.environ["DATABASE"],
    )
    engine = create_engine(url_object)
    try:
        df = pd.read_sql_table(
            "ge_validations_store_normalized",
            con=engine,
            schema="great_expectations",
        )
    finally:
        # The engine is only needed for this one read; release its pool.
        engine.dispose()

    validation_time = pd.to_datetime(
        df["meta.validation_time"], format="%Y%m%dT%H%M%S.%fZ"
    )
    df["meta.validation_time"] = validation_time
    # Vectorised date arithmetic: same values as the former row-wise
    # ``apply`` calls, but missing timestamps stay NaT instead of raising.
    df["meta.validation_date"] = validation_time.dt.normalize()
    df["meta.validation_yearmonth"] = validation_time.dt.to_period(
        "M"
    ).dt.to_timestamp()
    df["schema_table_name"] = (
        df["meta.batch_spec.schema_name"] + "." + df["meta.batch_spec.table_name"]
    )

    return df


# Validation results shared by every section below. load_data() is wrapped
# in st.cache_data, so script reruns reuse the cached frame.
data = load_data()

# Page title followed by a horizontal rule. This bare string expression is
# rendered as Markdown by Streamlit's "magic" feature.
"""
# 🍔 GêXis Gaúcho
---
"""

# First row of the dashboard: a narrow table next to a wider chart.
col1, col2 = st.columns([1.5, 2.5])

with col1:
    # Section heading (Markdown body kept flush-left so the literal is unchanged).
    """
## Últimas validation :red[failed]
"""

    # Most recent validation timestamp of each schema.table ...
    last_run = (
        data.groupby(["schema_table_name"])["meta.validation_time"]
        .max()
        .reset_index()
    )
    # ... joined back to the full data to recover the rows of that run.
    last_run_full = pd.merge(
        last_run,
        data,
        how="inner",
        on=["schema_table_name", "meta.validation_time"],
    )

    # Keep only the unsuccessful rows, newest first.
    failed_last_runs = last_run_full.loc[~last_run_full["success"]]
    failed_last_runs = failed_last_runs.sort_values(
        by="meta.validation_time", ascending=False
    )

    st.dataframe(
        failed_last_runs,
        column_order=["meta.validation_time", "schema_table_name"],
        column_config={
            "schema_table_name": st.column_config.TextColumn("schema.table"),
            "meta.validation_time": st.column_config.DatetimeColumn(
                "Validation date", format="DD/MM/YYYY"
            ),
        },
        hide_index=True,
    )


with col2:
    # Section heading (Markdown body kept flush-left so the literal is unchanged).
    """
## Percentual de sucesso por dia
"""

    # Rows per (validation date, success flag), pivoted to one column per flag.
    count_success = (
        data.groupby("meta.validation_date")["success"]
        .value_counts()
        .unstack(fill_value=0)
        # unstack() only creates columns for flags present in the data;
        # guarantee both so an all-pass or all-fail history does not raise
        # KeyError below.
        .reindex(columns=[True, False], fill_value=0)
    )
    # Share of successful rows per day, in the range 0..1.
    count_success["success_percentual"] = count_success[True] / (
        count_success[True] + count_success[False]
    )
    count_success = count_success.reset_index()

    fig1 = px.bar(count_success, x="meta.validation_date", y="success_percentual")
    fig1.update_layout(
        width=800,
    )
    fig1.update_traces(marker_color="#F63366", marker_line_width=0)

    st.plotly_chart(fig1, theme="streamlit")

"""
---
"""

col3, col4 = st.columns([1.5, 2.5])

with col3:
"""
## Top-10 `schema.table` errors
"""

top_fail = (
data[~data["success"]]
.groupby(["schema_table_name"])["success"]
.value_counts()
.unstack(fill_value=0)
.sort_values(False, ascending=False)
.head(10)
)
top_fail = top_fail.reset_index()

st.dataframe(
top_fail,
column_config={
"schema_table_name": st.column_config.TextColumn(
"schema.table",
),
"False": st.column_config.NumberColumn(
"Qt. Erros",
format="%d",
),
},
)

with col4:
    # Section heading (Markdown body kept flush-left so the literal is unchanged).
    """
## Top-10 `schema.table` errors por mês
"""

    # Failed rows per (schema.table, month).
    failed_rows = data[~data["success"]]
    top_fail_time = (
        failed_rows.groupby(["schema_table_name", "meta.validation_yearmonth"])[
            "success"
        ]
        .count()
        .reset_index()
    )

    # Restrict to the tables listed in the top-10 table, then spread the
    # tables into columns with one row per month.
    in_top_ten = top_fail_time["schema_table_name"].isin(
        top_fail["schema_table_name"]
    )
    top_fail_time_pivot = (
        top_fail_time[in_top_ten]
        .pivot_table(
            index="meta.validation_yearmonth",
            columns="schema_table_name",
            values="success",
            fill_value=0,
        )
        .reset_index()
    )

    # One line per table; the hover label shows the full date.
    fig2 = px.line(
        top_fail_time_pivot,
        x="meta.validation_yearmonth",
        y=top_fail_time_pivot.columns,
        hover_data={"meta.validation_yearmonth": "|%B %d, %Y"},
    )
    # Monthly ticks labelled with abbreviated month over year.
    fig2.update_xaxes(dtick="M1", tickformat="%b\n%Y")
    fig2.update_layout(width=800)

    st.plotly_chart(fig2, theme="streamlit")

"""
---
## Raw data
"""

col5, col6, col7, col8 = st.columns(4)

with col5:
which_schema = st.selectbox(
"Schema",
options=sorted(data["meta.batch_spec.schema_name"].unique()),
index=None,
)

with col6:
which_table = st.selectbox(
"Table",
options=(
sorted(
data.loc[
data["meta.batch_spec.schema_name"] == which_schema,
"meta.batch_spec.table_name",
].unique()
)
if which_schema
else sorted(data["meta.batch_spec.table_name"].unique())
),
index=None,
)

with col7:
which_date_min, which_date_max = st.date_input(
"Validation date range",
(data["meta.validation_time"].min(), data["meta.validation_time"].max()),
format="DD/MM/YYYY",
)

with col8:
is_success = st.multiselect("is_success", [True, False], [True, False])

st.markdown(
f"""
You selected: `{which_schema}`, `{which_table}`, `{which_date_min}`, `{which_date_max}`, `{is_success}`
"""
)

# Rows matching the filter widgets. The date-range and success conditions
# always apply; schema and table conditions are added when selected.
validation_day = data["meta.validation_time"].dt.date
row_filter = (
    (validation_day >= which_date_min)
    & (validation_day <= which_date_max)
    & data["success"].isin(is_success)
)
if which_schema:
    row_filter &= data["meta.batch_spec.schema_name"] == which_schema
if which_table:
    # Applied together with the schema condition: previously a chosen table
    # dropped the schema filter, so a same-named table in another schema
    # leaked into the result.
    row_filter &= data["meta.batch_spec.table_name"] == which_table

df_raw = data[row_filter]

# Columns shown in the raw-data grid, in display order.
raw_column_order = [
    "success",
    "meta.validation_time",
    "meta.batch_spec.schema_name",
    "meta.batch_spec.table_name",
    "meta.active_batch_definition.datasource_name",
    "expectation_config.expectation_type",
    "expectation_config.kwargs.max_value",
    "expectation_config.kwargs.min_value",
    "result.observed_value",
]

# Header labels, grouped by the column type used to render them.
text_column_labels = {
    "success": "Rodou?",
    "meta.batch_spec.schema_name": "Schema",
    "meta.batch_spec.table_name": "Table",
    "meta.active_batch_definition.datasource_name": "Conn",
    "expectation_config.expectation_type": "Expectation type",
}
number_column_labels = {
    "expectation_config.kwargs.max_value": "Max value allowed",
    "expectation_config.kwargs.min_value": "Min value allowed",
    "result.observed_value": "Value observed",
}

raw_column_config = {
    "meta.validation_time": st.column_config.DatetimeColumn(
        "Validation time",
        format="DD/MM/YYYY",
    ),
}
for column_name, label in text_column_labels.items():
    raw_column_config[column_name] = st.column_config.TextColumn(label)
for column_name, label in number_column_labels.items():
    raw_column_config[column_name] = st.column_config.NumberColumn(
        label, format="%d"
    )

# Filtered rows, without the pandas index.
st.dataframe(
    df_raw,
    column_order=raw_column_order,
    column_config=raw_column_config,
    width=1400,
    hide_index=True,
)
22 changes: 22 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
version: '3'

services:
  streamlit:
    build:
      context: .
    environment:
      # PostgreSQL connection settings read by app/streamlit_app.py.
      # Fill these in before starting the service.
      LOGIN: ""
      PASSWORD: ""
      HOST: ""
      PORT: ""
      DATABASE: ""
    ports:
      # Quoted so YAML never reinterprets the host:container pair.
      - "8501:8501"
    volumes:
      # Mounts the local sources over the copy baked into the image.
      - ./app:/app
    healthcheck:
      # The python:*-slim base image does not ship curl, so probe the health
      # endpoint with the interpreter that is guaranteed to be present.
      # urlopen raises (non-zero exit) on connection errors and HTTP errors.
      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8501/_stcore/health', timeout=5)"]
      interval: 10s
      timeout: 10s
      retries: 5
    restart: always

0 comments on commit 59ae2bf

Please sign in to comment.