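"""Build per-run file lists of HIST_CALOFITTINGQA files from the FileCatalog database.

For every (dataset, runnumber) whose DST_CALOFITTING_run2pp segments sum to
more than a given number of events, the matching HIST_CALOFITTINGQA file
paths are written to files/hists/<runnumber>_<dataset>.list.  Runs already
listed in completedruns.txt are skipped, and newly processed runs are
appended to that file.
"""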

import os

import pandas as pd
import psycopg2

def get_file_paths(threshold=500000):
    """Return a DataFrame of (dataset, runnumber, full_file_path) rows for
    HIST_CALOFITTINGQA files whose DST_CALOFITTING_run2pp segments (those with
    a matching QA histogram entry) contain more than `threshold` events in total."""
    query = f"""
    SELECT
        a.dataset, a.runnumber, f.full_file_path
    FROM
        datasets a
    JOIN (
        -- Runs (per dataset) with enough events, counting only segments
        -- that also have a matching HIST_CALOFITTINGQA entry.
        SELECT
            d.dataset, d.runnumber
        FROM
            datasets d
        JOIN (
            SELECT
                dataset, runnumber, segment
            FROM
                datasets
            WHERE
                dsttype LIKE 'HIST_CALOFITTINGQA%'
        ) h
        ON
            d.dataset = h.dataset AND d.runnumber = h.runnumber AND d.segment = h.segment
        WHERE
            d.dsttype LIKE 'DST_CALOFITTING_run2pp'
        GROUP BY
            d.dataset, d.runnumber
        HAVING
            SUM(d.events) > {threshold}
    ) k
    ON
        k.dataset = a.dataset AND k.runnumber = a.runnumber
    JOIN
        files f
    ON
        f.lfn = a.filename
    WHERE
        a.filename LIKE 'HIST_CALOFITTINGQA%';
    """

    # Connection parameters other than the database name (host, user,
    # password, ...) come from the usual libpq defaults and environment variables.
    with psycopg2.connect("dbname=FileCatalog") as conn:
        return pd.read_sql_query(query, conn)
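
# The event threshold can be overridden per call, e.g. (hypothetical value):
#
#     df = get_file_paths(threshold=1_000_000)
#     print(df.drop_duplicates(['runnumber', 'dataset']))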


def main():
    df = get_file_paths()

    print(df)
    print(df.drop_duplicates(['runnumber', 'dataset']))

    os.makedirs('files/hists', exist_ok=True)

    # completedruns.txt holds one "runnumber,dataset" entry per run that has
    # already been processed; start with an empty set if it does not exist yet.
    if os.path.exists('completedruns.txt'):
        with open('completedruns.txt') as f:
            completed_runs_datasets = set(line.strip() for line in f)
    else:
        completed_runs_datasets = set()

    # Drop every row belonging to a run/dataset that was already processed.
    fdf = df[~df.apply(lambda row: f"{row['runnumber']},{row['dataset']}" in completed_runs_datasets, axis=1)]

    # One iteration per remaining (runnumber, dataset) pair.
    for index in fdf.drop_duplicates(['runnumber', 'dataset']).index:
        dataset = fdf.loc[index, 'dataset']
        run = fdf.loc[index, 'runnumber']
        print(f'Processing: {run},{dataset}')

        # Record the run first so it is not picked up again on the next invocation.
        with open('completedruns.txt', 'a') as f:
            f.write(f"{run},{dataset}\n")

        # Write the file paths for this run/dataset, one per line, no header.
        fdf[(fdf['runnumber'] == run) & (fdf['dataset'] == dataset)]['full_file_path'].to_csv(
            f'files/hists/{run}_{dataset}.list', index=False, header=False)


if __name__ == "__main__":
    main()