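"""Build per-run file lists of HIST_CALOFITTINGQA files from the FileCatalog database.

For every (dataset, runnumber) whose DST_CALOFITTING_run2pp segments sum to
more than a given number of events, the matching HIST_CALOFITTINGQA file
paths are written to files/hists/<runnumber>_<dataset>.list.  Runs already
listed in completedruns.txt are skipped, and newly processed runs are
appended to that file.
"""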

import os

import pandas as pd
import psycopg2

def get_file_paths(threshold=500000):
    """Return a DataFrame of (dataset, runnumber, full_file_path) rows for
    HIST_CALOFITTINGQA files whose DST_CALOFITTING_run2pp segments (those with
    a matching QA histogram entry) contain more than `threshold` events in total."""
    query = f"""
    SELECT
        a.dataset, a.runnumber, f.full_file_path
    FROM
        datasets a
    JOIN (
        -- Runs (per dataset) with enough events, counting only segments
        -- that also have a matching HIST_CALOFITTINGQA entry.
        SELECT
            d.dataset, d.runnumber
        FROM
            datasets d
        JOIN (
            SELECT
                dataset, runnumber, segment
            FROM
                datasets
            WHERE
                dsttype LIKE 'HIST_CALOFITTINGQA%'
        ) h
        ON
            d.dataset = h.dataset AND d.runnumber = h.runnumber AND d.segment = h.segment
        WHERE
            d.dsttype LIKE 'DST_CALOFITTING_run2pp'
        GROUP BY
            d.dataset, d.runnumber
        HAVING
            SUM(d.events) > {threshold}
    ) k
    ON
        k.dataset = a.dataset AND k.runnumber = a.runnumber
    JOIN
        files f
    ON
        f.lfn = a.filename
    WHERE
        a.filename LIKE 'HIST_CALOFITTINGQA%';
    """

    # Connection parameters other than the database name (host, user,
    # password, ...) come from the usual libpq defaults and environment variables.
    with psycopg2.connect("dbname=FileCatalog") as conn:
        return pd.read_sql_query(query, conn)
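
# The event threshold can be overridden per call, e.g. (hypothetical value):
#
#     df = get_file_paths(threshold=1_000_000)
#     print(df.drop_duplicates(['runnumber', 'dataset']))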


def main():
    df = get_file_paths()

    print(df)
    print(df.drop_duplicates(['runnumber', 'dataset']))

    os.makedirs('files/hists', exist_ok=True)

    # completedruns.txt holds one "runnumber,dataset" entry per run that has
    # already been processed; start with an empty set if it does not exist yet.
    if os.path.exists('completedruns.txt'):
        with open('completedruns.txt') as f:
            completed_runs_datasets = set(line.strip() for line in f)
    else:
        completed_runs_datasets = set()

    # Drop every row belonging to a run/dataset that was already processed.
    fdf = df[~df.apply(lambda row: f"{row['runnumber']},{row['dataset']}" in completed_runs_datasets, axis=1)]

    # One iteration per remaining (runnumber, dataset) pair.
    for index in fdf.drop_duplicates(['runnumber', 'dataset']).index:
        dataset = fdf.loc[index, 'dataset']
        run = fdf.loc[index, 'runnumber']
        print(f'Processing: {run},{dataset}')

        # Record the run first so it is not picked up again on the next invocation.
        with open('completedruns.txt', 'a') as f:
            f.write(f"{run},{dataset}\n")

        # Write the file paths for this run/dataset, one per line, no header.
        fdf[(fdf['runnumber'] == run) & (fdf['dataset'] == dataset)]['full_file_path'].to_csv(
            f'files/hists/{run}_{dataset}.list', index=False, header=False)


if __name__ == "__main__":
    main()