1st commit

This commit is contained in:
Ankit Malik
2026-06-25 11:20:22 +05:30
parent 6b2d754981
commit 1d5ad2d793
14 changed files with 2322 additions and 620 deletions
+7 -226
View File
@@ -46,113 +46,13 @@ from mids import (
from src.bridge import *
from src.fact import *
from src.dim import *
# ==========================================================
# Helpers
# ==========================================================
def get_dates_from_yaml(filename: str):
    """Load the pipeline date range and flag from a YAML file.

    The file is parsed with ``yaml.safe_load`` and is expected to hold a
    top-level ``pipeline`` mapping with ``start_date``, ``end_date`` and
    ``flag`` entries.

    Args:
        filename: Path of the YAML file to read.

    Returns:
        A ``(start_date, end_date, flag)`` tuple: two ``date`` objects
        followed by the flag converted to ``str``.

    Raises:
        KeyError: If the parsed mapping lacks ``pipeline`` or one of the
            three expected entries.
        ValueError: If a date value is rejected by ``date.fromisoformat``.
    """
    with open(filename, "r") as handle:
        document = yaml.safe_load(handle)

    pipeline_cfg = document["pipeline"]

    # Each value goes through str() first so that whatever scalar type the
    # YAML loader produced is handed to fromisoformat as text.
    parsed = [
        date.fromisoformat(str(pipeline_cfg[key]))
        for key in ("start_date", "end_date")
    ]
    return parsed[0], parsed[1], str(pipeline_cfg["flag"])
def write_table_to_yaml(
    data: dict,
    run_date: date,
    filename: str | None = None
):
    """Dump ``data`` to a YAML file and print the destination path.

    Args:
        data: Mapping to serialise with ``yaml.dump``.
        run_date: Used only to build the default file name.
        filename: Output path. When ``None``, the file is named
            ``elt_pipeline_<run_date>.yml``.
    """
    target = f"elt_pipeline_{run_date}.yml" if filename is None else filename

    # Block style output; keys are written without being sorted.
    dump_options = {"default_flow_style": False, "sort_keys": False}
    with open(target, "w") as out:
        yaml.dump(data, out, **dump_options)

    print(f"Table written to {target}")
from elt import *
def table_exists(
    client,
    table_name: str,
) -> bool:
    """Report whether ``table_name`` exists according to ``client``.

    Sends an ``EXISTS TABLE`` statement through ``client.command`` and
    returns the truthiness of whatever comes back.

    Args:
        client: Object exposing a ``command(sql)`` method.
        table_name: Name interpolated verbatim into the statement.

    Returns:
        ``bool()`` of the value returned by ``client.command``.
    """
    statement = f"EXISTS TABLE {table_name}"
    outcome = client.command(statement)
    return bool(outcome)
# ==========================================================
# Main
# ==========================================================
def elt(run_date : date):
log.info("=" * 80)
log.info("Hello from data-move Python data pipeline!")
# ------------------------------------------------------
# Run Date
# ------------------------------------------------------
log.info(
"Pipeline Run Date: %s",
run_date,
)
# ------------------------------------------------------
# Connections
# ------------------------------------------------------
log.info(
"Connecting to databases..."
)
sql_engine = build_sql_server_engine()
clickhouse_engine = build_clickhouse_engine()
client = get_clickhouse_client()
log.info(
"Database connections established"
)
# ------------------------------------------------------
# mids Keys
# ------------------------------------------------------
mids = MID_TABLE_COV(
sql_engine,
run_date,
)
emp_visit_df = MID_TABLE_COV1(
sql_engine,
run_date,
)
def main() :
# ------------------------------------------------------
# Config
@@ -163,128 +63,8 @@ def elt(run_date : date):
"r",
) as file:
config = yaml.safe_load(file)
table_config = yaml.safe_load(file)
# ------------------------------------------------------
# Process Tables
# ------------------------------------------------------
for table in config["tables"]:
table_name = table["name"]
operation = table["operation"]
fetch_by = table["fetch_by"]
table_type=table["type"]
log.info("=" * 80)
log.info(f"Processing Table-:{table_name} | Table type -:{table_type} | fetcht by-:{fetch_by} | operation-:{operation}" )
try:
# ------------------------------------------
# Fetch Data
# ------------------------------------------
log.info(f"Fetching Data from sql server for table-: {table_name} ..............")
fetch_list=["mids" ,"run_date", "reason_id"]
if fetch_by in fetch_list :
fn_name = f"fetch_{table_name}"
fn = globals()[fn_name]
df=fn(sql_engine, table_name , table_type, mids, run_date)
else:
df = fetch_data(sql_engine ,table_name,table_type)
log.info(f"Fetched total row -: {len(df)} from sql server for table-:{table_name} ...........!!!")
if df.is_empty():
log.warning(
"%s returned no rows",
table_name,
)
continue
log.info(
"Fetched %s rows",
len(df),
)
# ------------------------------------------
# Create Table If Missing
# ------------------------------------------
exists = table_exists(
client,
table_name,
)
if not exists:
log.info(
"Creating table %s",
table_name,
)
create_clickhouse_table(
df=df,
table_name=table_name,
clickhouse_engine=clickhouse_engine,
)
# ------------------------------------------
# Existing Table Logic
# ------------------------------------------
else:
if operation == "DELETE+INSERT":
truncate_table(
client,
table_name,
)
else:
delete_existing_data(
client=client,
table_name=table_name,
run_date=run_date,
mids=mids,
emp_visit_df=emp_visit_df,
)
# ------------------------------------------
# Load Data
# ------------------------------------------
log.info("_ _ _ _Inserting data into clickhouse db from sql server_ _ _ _")
load_to_clickhouse(
client=client,
table_name=table_name,
df=df,
)
log.info(
"%s loaded successfully (%s rows)",
table_name,
len(df),
)
except Exception:
log.exception(
"Failed processing table %s",
table_name,
)
raise
log.info("=" * 80)
log.info("Pipeline Completed Successfully")
log.info("=" * 80)
def main() :
config_file = Path("Pipeline_config.yml")
@@ -336,7 +116,7 @@ def main() :
for attempt in range(3):
try:
elt(run_date)
elt(run_date , table_config)
successful_dates.append({
'pipeline_trigeered_on_date': str(date.today()),
@@ -370,7 +150,7 @@ def main() :
)
sleep(5)
start_date=start_date + timedelta(days=1)
@@ -392,7 +172,8 @@ def main() :
with open(filename_failed, "w") as f:
yaml.dump(failed_dates,
f, default_flow_style=False,
sort_keys=False)
sort_keys=False)
if __name__ == "__main__":