#!/bin/python3
'''
Plot some project statistics based upon .difxlog and .machines files
in one or more project top level directories.

Produces histograms of "wallclock time" durations of the DiFX job,
number of nodes per job, the idle time between jobs, as well as
a scatter plot of job duration vs its node count.

Allows comparisons between DiFX runs of the same project run with
different cluster node allocations, and also different projects
of generally the same kind e.g. IVS R1.
'''

import argparse
import datetime
import glob
import os
import sys

import numpy as np

__author__ = "Jan Wagner (MPIfR)"
__version__ = "1.0.1"


def parse_args(args: list):
    '''
    Parse the command line.

    'args' is the argument list without the program name (i.e. sys.argv[1:]).
    Returns the argparse namespace with the fields maxfiles, maxruntime,
    maxidletime, maxnodes, use_dirname and directories.
    '''
    parser = argparse.ArgumentParser(description=__doc__, add_help=True,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('-f', '--maxfiles', help='Inspect at most files per project dir (0=unlimited)', default=0, type=int)
    parser.add_argument('-r', '--maxruntime', help='Max expected job run time (default: %(default)d)', default=150, type=int)
    parser.add_argument('-i', '--maxidletime', help='Max expected idle time between jobs (default: %(default)d)', default=180, type=int)
    parser.add_argument('-n', '--maxnodes', help='Max number of nodes expected (default: %(default)d)', default=70, type=int)
    parser.add_argument('-d', '--use-dirname', help='Label data not by project but by subdirectory', action='store_true')
    parser.add_argument('directories', nargs='*')
    return parser.parse_args(args)


def getStartingTime(difxlog):
    '''
    Open 'difxlog' and get the timestamp of the first entry.
    Return it as a Unix time ie seconds since Unix Epoch.
    Returns 0 (after printing a note) if the first line does not
    start with a parseable timestamp.
    '''
    # Note, none of os.path.getatime, .getmtime, .getctime,
    # nor os.stat (C fstat()) return a file creation date,
    # only at best the last file modification time.
    # Hence, read the time from the difx log instead.
    with open(difxlog, 'r', errors='replace') as df:
        line = df.readline()

    # line = "Fri Aug 26 10:54:20 2022 2 io02 INFO MPI Process 2 is running on host io02"
    # line = "Tue Apr 5 17:13:37 2022 16 node15.service INFO MPI Process 16 is running on host node15.service"
    # The timestamp is the first five whitespace separated fields. Taking
    # fields rather than a fixed-width slice keeps the parse correct no
    # matter whether a single digit day of month is padded or not.
    stamp = ' '.join(line.split()[:5])
    try:
        tstamp = datetime.datetime.strptime(stamp, '%a %b %d %H:%M:%S %Y')
    except ValueError:
        print ("Could not parse time from %s entry '%s'" % (difxlog, line))
        return 0
    return tstamp.timestamp()


def getWallclockTime(difxlog):
    '''
    Open 'difxlog' and look for a line similar to
    INFO Total wallclock time was **5.10744** seconds

    NB: Since .difxlog get appended, its possible to encounter
    multiple wallclock time entries. Use the last one found.

    Returns the time in seconds, or 0 if no usable entry was found.
    '''
    t = 0
    with open(difxlog, 'r', errors='replace') as df:
        for line in df:
            if 'wallclock time' not in line:
                continue
            try:
                t = float(line.split('**')[1])
            except (IndexError, ValueError):
                # Line mentions wallclock time but carries no '**value**'
                # field; keep whatever was found earlier.
                continue
    return t


def getNodeCount(difxlog):
    '''
    Check .machines file corresponding to the difxlog, return
    the number of nodes (nr of non-blank lines) in the machinesfile.
    Returns 0 if the .machines file is missing or unreadable, so that
    the caller can skip the job rather than abort.
    '''
    machinesfile = difxlog.split('.difxlog')[0] + '.machines'
    try:
        with open(machinesfile, 'r') as mf:
            return sum(1 for entry in mf if entry.strip())
    except OSError:
        return 0


def getDataSeries(path, Nmax=0):
    '''
    Return various data for all (or at most Nmax) .difxlog logs
    and related files that can be found under the given 'path'.

    Returns a tuple (starttimes, wallclocktimes, nodecounts) of equally
    long lists. Logs lacking a start time, a wallclock time, or a node
    count are skipped. Logs are visited in sorted file name order so that
    a Nmax limit always selects the same files.
    '''
    starttimes, wallclocktimes, nodecounts = [], [], []

    pattern = os.path.join(glob.escape(path), '*.difxlog')
    for difxlog in sorted(glob.glob(pattern)):

        t = getStartingTime(difxlog)
        wt = getWallclockTime(difxlog)
        nc = getNodeCount(difxlog)

        if t <= 0 or wt <= 0 or nc <= 0:
            continue

        starttimes.append(t)
        wallclocktimes.append(wt)
        nodecounts.append(nc)

        if Nmax > 0 and len(starttimes) == Nmax:
            break

    return (starttimes, wallclocktimes, nodecounts)


def getExperimentName(path):
    '''
    Return the name of experiment, determined from the prefix of
    the first (in sorted order) log file (_r.difxlog).
    Returns 'n/a' if 'path' contains no .difxlog file.
    '''
    pattern = os.path.join(glob.escape(path), '*.difxlog')
    for difxlog in sorted(glob.glob(pattern)):
        logname = os.path.split(difxlog)[1]    # eg /path/r1999_0013.difxlog => r1999_0013.difxlog
        basename = logname.split('.difxlog')[0]    # eg r1999_0013.difxlog => r1999_0013
        return basename.split('_')[0]
    return 'n/a'


def getExperimentSubdirname(path):
    '''
    Return "subdir" part of 'path'. Can be used to label plots
    of the same experiment correlated multiple times.

    Accepts /path/subdir as well as /path/subdir/ with any number
    of trailing slashes.
    '''
    return path.rstrip('/').split('/')[-1]


def makeBins(upper, step):
    '''
    Return histogram bin edges 0, step, 2*step, ... whose last edge is
    at or above 'upper', so that a value equal to the configured maximum
    still lands in a bin. Always yields at least two edges.
    '''
    return list(range(0, max(upper, 1) + step, step))


def getIdleGaps(starttimes, wallclocktimes):
    '''
    Return the idle time between consecutive jobs: jobs are ordered by
    start time, and each gap is the start of the next job minus the end
    (start + wallclock duration) of the previous one. A gap is negative
    if jobs overlapped. Returns a list with one entry less than the
    number of jobs (empty for zero or one job).
    '''
    starts = np.asarray(starttimes, dtype=float)
    durations = np.asarray(wallclocktimes, dtype=float)
    order = np.argsort(starts, kind='stable')
    begins = starts[order]
    ends = begins + durations[order]
    return (begins[1:] - ends[:-1]).tolist()


def main(argv=None):
    '''
    Script entry point. 'argv' is the argument list without the program
    name; defaults to sys.argv[1:]. Returns the process exit code.
    '''
    userargs = parse_args(sys.argv[1:] if argv is None else argv)
    if not userargs.directories:
        print('No project directories given, nothing to plot (see --help)', file=sys.stderr)
        return 0

    # Collect the data of every directory before touching the plotting
    # backend, so nothing graphical is set up when there is nothing to show.
    datasets = []
    for path in userargs.directories:

        if userargs.use_dirname:
            expt = getExperimentSubdirname(path)
        else:
            expt = getExperimentName(path)

        data_tstartUnix, data_twall, data_nodecount = getDataSeries(path, Nmax=userargs.maxfiles)
        if not data_tstartUnix:
            print('No DiFX log files in directory ' + path)
            continue

        datasets.append((expt, data_twall, data_nodecount, getIdleGaps(data_tstartUnix, data_twall)))

        if max(data_twall) > userargs.maxruntime:
            print("Warning: '%s' logs had max wallclock duration of %d sec, exceeds %d sec binning. Try --maxruntime %d" % (path, max(data_twall), userargs.maxruntime, 60*int(1 + max(data_twall)/60)))

    # Early exit if no data
    if not datasets:
        return 0

    # Imported here rather than at the top of the file so that the helper
    # functions stay importable (and the early exits above stay usable)
    # on machines without matplotlib or without a display.
    import matplotlib.pyplot as plt

    # Bins
    bins_time = makeBins(userargs.maxruntime, 5)
    bins_idletime = makeBins(userargs.maxidletime, 5)
    bins_numnodes = makeBins(userargs.maxnodes, 2)

    # Data and plots
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, constrained_layout=True)
    for expt, data_twall, data_nodecount, t_gaps in datasets:
        ax1.hist(data_twall, bins_time, alpha=0.5, label=expt)
        ax2.hist(data_nodecount, bins_numnodes, alpha=0.5, label=expt)
        ax3.scatter(data_nodecount, data_twall, alpha=0.5, label=expt)
        ax4.hist(t_gaps, bins_idletime, alpha=0.5, label=expt)

    # Captions
    ax1.set_title("Distribution of job durations")
    ax1.legend(loc='upper right')
    ax1.set_xlabel('Time (seconds)')

    ax2.set_title("Distribution of nodes per job")
    ax2.legend(loc='upper right')
    ax2.set_xlabel('Nodes used (#)')

    ax3.set_title("Job duration against nodes allocated")
    ax3.legend(loc='upper right')
    ax3.set_xlabel('Nodes used (#)')
    ax3.set_ylabel('Time (seconds)')

    ax4.set_title("Distribution of time idle between jobs")
    ax4.legend(loc='upper right')
    ax4.set_xlabel('Time (seconds)')

    fig.set_size_inches(10, 6)
    plt.show()
    return 0


if __name__ == "__main__":
    sys.exit(main())