Jet-Study/scripts/genCorruptSegments.py

0001 #!/usr/bin/env python3
0002 import subprocess
0003 import argparse
0004 import os
0005
def _build_parser():
    """Create the command-line parser used by this script.

    Every option takes a string value and has a default, so the script can be
    run without any arguments.
    """
    # One row per option: (short flag, long flag, default value, help text).
    option_table = (
        ('-i', '--log-dir',
         '/sphenix/data/data02/sphnxpro/run2pp/calologs/ana446_2024p007',
         'Location of the log directory.'),
        ('-d', '--dataset',
         'ana446_2024p007',
         'Production Dataset.'),
        ('-d2', '--dsttype',
         'DST_TRIGGERED_EVENT_run2pp',
         'Production Dst Type.'),
        ('-o', '--output',
         'bad-calologs-ana446-2024p007.list',
         'Output list file of all files that have an error.'),
        ('-o2', '--output-segments',
         'bad-ana446-2024p007-segments.list',
         'Output list file of all segments that have an error.'),
        ('-o3', '--output-dir',
         'bad-segments',
         'Output directory to save all files.'),
    )

    cli = argparse.ArgumentParser()
    for short_flag, long_flag, fallback, description in option_table:
        cli.add_argument(short_flag, long_flag, type=str, default=fallback, help=description)
    return cli


parser = _build_parser()

# Parsed at module level, exactly as before: the main section reads `args`.
args = parser.parse_args()
0016
def _run_bash(script, *params, cwd):
    """Run a bash snippet in ``cwd`` with ``params`` bound to "$1", "$2", ...

    Values are handed to bash as positional parameters instead of being pasted
    into the script text, so paths that contain spaces or shell metacharacters
    reach the commands unchanged.

    Args:
        script: Bash source passed to ``bash -c``.
        *params: Strings exposed to the script as its positional parameters.
        cwd: Working directory of the bash process.

    Returns:
        The ``subprocess.CompletedProcess`` of the bash invocation. The exit
        status is intentionally not checked (e.g. ripgrep exits non-zero when
        it simply finds no match).
    """
    # The first word after the script text becomes $0, hence the 'bash' placeholder.
    return subprocess.run(['bash', '-c', script, 'bash', *params], cwd=cwd)


if __name__ == '__main__':
    log_dir         = os.path.realpath(args.log_dir)
    dataset         = args.dataset
    dsttype         = args.dsttype
    output          = args.output
    output_segments = args.output_segments
    output_dir      = os.path.realpath(args.output_dir)

    # ripgrep binary used for every text search below
    rg = '/direct/sphenix+u/anarde/.cargo/bin/rg'

    print(f'Log Dir: {log_dir}')
    print(f'Dataset: {dataset}')
    print(f'DST Type: {dsttype}')
    print(f'Output: {output}')
    print(f'Output Segments: {output_segments}')
    print(f'Output Directory: {output_dir}')

    os.makedirs(output_dir, exist_ok=True)

    # The search loop below appends to the output file, so remove any copy left
    # over from a previous run. os.path.join resolves the name the same way the
    # shell does when it runs with cwd=output_dir.
    try:
        os.remove(os.path.join(output_dir, output))
    except FileNotFoundError:
        pass
    else:
        print(f'File {output} deleted successfully.')

    # generate the list of log files that contain Error
    # $1 = ripgrep binary, $2 = output list, $3 = log directory
    search_script = """
i=0
while IFS= read -r d; do
    echo "Processing Dir: $d, $i"
    "$1" -l "Error" "$d" >> "$2"
    echo "logs: $(wc -l "$2")"
    i=$((i+1))
done < <(readlink -f "$3"/*)
"""
    _run_bash(search_script, rg, output, log_dir, cwd=output_dir)

    # sort the list of log files in place
    _run_bash('sort -o "$1" "$1"', output, cwd=output_dir)

    # extract only the run-segments from the list of log files:
    # take the basename, keep '-'-separated fields 2-3, cut at the first '.'
    _run_bash(
        '''awk -F'/' '{print $NF}' "$1" | cut -d"-" -f2,3 | cut -d"." -f1 | sort > "$2"''',
        output, output_segments, cwd=output_dir)

    # get the list of produced DST
    # Double embedded single quotes so the values stay inside their SQL string literals.
    sql_dataset = dataset.replace("'", "''")
    sql_dsttype = dsttype.replace("'", "''")
    query = f"select filename from datasets where dataset = '{sql_dataset}' and dsttype='{sql_dsttype}';"
    produced_list = f'{dsttype}_{dataset}.list'
    _run_bash(
        'psql FileCatalog -c "$1" -At | cut -d"-" -f2,3 | cut -d"." -f1 | sort > "$2"',
        query, produced_list, cwd=output_dir)

    # find the list of segments that overlap between containing Error and produced
    # (comm -12 keeps only lines common to both sorted inputs)
    produced_segments = f'{os.path.splitext(output_segments)[0]}-produced.list'
    _run_bash('comm -12 "$1" "$2" > "$3"',
              produced_list, output_segments, produced_segments, cwd=output_dir)

    # keep only the Error log files whose run-segment appears in the produced overlap
    # (-F: fixed strings, -f: read the patterns from the segment list)
    produced_logs = f'{os.path.splitext(output)[0]}-produced.list'
    _run_bash('"$1" -Ff "$2" "$3" > "$4"',
              rg, produced_segments, output, produced_logs, cwd=output_dir)