Package PyFoam :: Package Infrastructure :: Module ClusterJob
[hide private]
[frames] | [no frames]

Source Code for Module PyFoam.Infrastructure.ClusterJob

  1  #  ICE Revision: $Id$ 
  2  """Encapsulates all necessary things for a cluster-job, like setting 
  3  up, running, restarting""" 
  4   
  5  import os,sys,subprocess 
  6  from os import path,unlink 
  7  from threading import Thread,Lock,Timer 
  8   
  9  from PyFoam.Applications.Decomposer import Decomposer 
 10  from PyFoam.Applications.Runner import Runner 
 11  from PyFoam.Applications.SteadyRunner import SteadyRunner 
 12  from PyFoam.Applications.CloneCase import CloneCase 
 13  from PyFoam.Applications.FromTemplate import FromTemplate 
 14   
 15  from PyFoam.FoamInformation import changeFoamVersion 
 16  from PyFoam.FoamInformation import foamVersion as getFoamVersion 
 17  from PyFoam.Error import error,warning 
 18  from PyFoam import configuration as config 
 19  from PyFoam.FoamInformation import oldAppConvention as oldApp 
 20  from PyFoam.RunDictionary.SolutionDirectory import SolutionDirectory 
 21   
 22  from PyFoam.ThirdParty.six import print_,iteritems 
 23   
def checkForMessageFromAbove(job):
    """Timer callback that polls for control-files dropped by the frontend.

    A stop-file makes the job stop (and polling ends); a checkpoint-file
    makes it write a checkpoint.  While the job keeps listening the
    callback re-arms itself so the files are checked once per second.
    @param job: the ClusterJob instance being supervised"""
    if not job.listenToTimer:
        return

    if path.exists(job.stopFile()):
        job.stopJob()
        return

    if path.exists(job.checkpointFile()):
        job.writeCheckpoint()

    # re-arm the one-second polling timer
    rearm=Timer(1.,checkForMessageFromAbove,args=[job])
    job.timer=rearm
    rearm.start()

class ClusterJob(object):
    """All Cluster-jobs are to be derived from this base-class

    The actual jobs are implemented by overriding methods

    There is a number of variables in this class that are used to
    'communicate' information between the various stages"""

    def __init__(self,
                 basename,
                 arrayJob=False,
                 hardRestart=False,
                 autoParallel=True,
                 doAutoReconstruct=None,
                 foamVersion=None,
                 compileOption=None,
                 useFoamMPI=False,
                 multiRegion=False,
                 parameters=None,
                 isDecomposed=False):
        """Initializes the Job
        @param basename: Basis name of the job
        @param arrayJob: this job is a parameter variation. The tasks
        are identified by their task-id
        @param hardRestart: treat the job as restarted
        @param autoParallel: Parallelization is handled by the base-class
        @param doAutoReconstruct: Automatically reconstruct the case if
        autoParallel is set. If the value is None then it is looked up from
        the configuration
        @param foamVersion: The foam-Version that is to be used
        @param compileOption: Forces compile-option (usually 'Opt' or 'Debug')
        @param useFoamMPI: Use the OpenMPI supplied with OpenFOAM
        @param multiRegion: This job consists of multiple regions
        @param parameters: Dictionary with parameters that are being passed
        to the Runner. Copied; the caller's dictionary is never mutated
        @param isDecomposed: Assume that the job is already decomposed"""

        # fail early if we are not running inside an SGE job
        if not "JOB_ID" in os.environ:
            error("Not an SGE-job. Environment variable JOB_ID is missing")
        self.jobID=int(os.environ["JOB_ID"])
        self.jobName=os.environ["JOB_NAME"]

        self.basename=path.join(path.abspath(path.curdir),basename)

        # SGE sets RESTARTED to a non-zero value if it restarted the job itself
        sgeRestarted=False
        if "RESTARTED" in os.environ:
            sgeRestarted=(int(os.environ["RESTARTED"])!=0)

        self.restarted=bool(sgeRestarted or hardRestart)

        if foamVersion==None:
            foamVersion=config().get("OpenFOAM","Version")

        changeFoamVersion(foamVersion,compileOption=compileOption)

        if not "WM_PROJECT_VERSION" in os.environ:
            error("No OpenFOAM-Version seems to be configured. Set the foamVersion-parameter")

        self.autoParallel=autoParallel

        self.doAutoReconstruct=doAutoReconstruct
        if self.doAutoReconstruct==None:
            self.doAutoReconstruct=config().getboolean("ClusterJob","doAutoReconstruct")

        self.multiRegion=multiRegion

        # copy the dict: doIt() mutates self.parameters, and with a shared
        # (or default) dict that mutation would leak across instances
        self.parameters={} if parameters is None else dict(parameters)

        self.hostfile=None
        self.nproc=1

        if "NSLOTS" in os.environ:
            self.nproc=int(os.environ["NSLOTS"])
            self.message("Running on",self.nproc,"CPUs")
            if self.nproc>1:
                self.hostfile=path.join(os.environ["TMP"],"machines")
                if config().getboolean("ClusterJob","useMachineFile"):
                    self.message("Using the machinefile",self.hostfile)
                    # close the handle instead of leaking it
                    with open(self.hostfile) as mf:
                        self.message("Contents of the machinefile:",mf.readlines())
                else:
                    self.message("No machinefile used because switched off with 'useMachineFile'")

        self.ordinaryEnd=True
        self.listenToTimer=False

        self.taskID=None
        self.arrayJob=arrayJob

        if self.arrayJob:
            self.taskID=int(os.environ["SGE_TASK_ID"])

        # NOTE(review): eval() on a configuration value — assumed trusted,
        # since the config files are under the cluster admin's control
        if not useFoamMPI and not foamVersion in eval(config().get("ClusterJob","useFoamMPI",default='[]')):
            # prepend special paths for the cluster
            self.message("Adding Cluster-specific paths")
            os.environ["PATH"]=config().get("ClusterJob","path")+":"+os.environ["PATH"]
            os.environ["LD_LIBRARY_PATH"]=config().get("ClusterJob","ldpath")+":"+os.environ["LD_LIBRARY_PATH"]

        self.isDecomposed=isDecomposed

    def fullJobId(self):
        """Return a string with the full job-ID (jobID plus taskID
        for array-jobs)"""
        result=str(self.jobID)
        if self.arrayJob:
            result+=":"+str(self.taskID)
        return result

    def message(self,*txt):
        """Print a prefixed status message and flush stdout so the
        cluster log stays in order"""
        print_("=== CLUSTERJOB: ",end="")
        for t in txt:
            print_(t,end=" ")
        print_(" ===")
        sys.stdout.flush()

    def setState(self,txt):
        """Write the job state to the ClusterJobState file in the case
        directory so the frontend can monitor it
        @param txt: the state description"""
        self.message("Setting Job state to",txt)
        fName=path.join(self.casedir(),"ClusterJobState")
        with open(fName,"w") as f:
            f.write(txt+"\n")

    def jobFile(self):
        """The file with the job information"""
        jobfile="%s.%d" % (self.jobName,self.jobID)
        if self.arrayJob:
            jobfile+=".%d" % self.taskID
        jobfile+=".pyFoam.clusterjob"
        jobfile=path.join(path.dirname(self.basename),jobfile)

        return jobfile

    def checkpointFile(self):
        """The file that makes the job write a checkpoint"""
        return self.jobFile()+".checkpoint"

    def stopFile(self):
        """The file that makes the job write a checkpoint and end"""
        return self.jobFile()+".stop"

    def doIt(self):
        """The central logic. Runs the job, sets it up etc"""

        with open(self.jobFile(),"w") as f:
            f.write(path.basename(self.basename)+"\n")

        self.message()
        self.message("Running on directory",self.casename())
        self.message()
        self.setState("Starting up")

        if self.arrayJob:
            for k,v in list(self.taskParameters(self.taskID).items()):
                self.parameters[k]=v

        self.parameters.update(self.additionalParameters())

        self.message("Parameters:",self.parameters)
        if not self.restarted:
            self.setState("Setting up")
            self.setup(self.parameters)
            if self.autoParallel and self.nproc>1 and not self.isDecomposed:
                self.setState("Decomposing")
                self.autoDecompose()

            self.isDecomposed=True

            self.setState("Setting up 2")
            self.postDecomposeSetup(self.parameters)
        else:
            self.setState("Restarting")

        # from here on the case is treated as decomposed in both branches
        self.isDecomposed=True

        self.setState("Running")
        self.listenToTimer=True
        self.timer=Timer(1.,checkForMessageFromAbove,args=[self])
        self.timer.start()

        self.run(self.parameters)
        self.listenToTimer=False

        if path.exists(self.jobFile()):
            unlink(self.jobFile())

        if self.ordinaryEnd:
            self.setState("Post Running")
            self.preReconstructCleanup(self.parameters)

            if self.autoParallel and self.nproc>1:
                self.setState("Reconstructing")
                self.autoReconstruct()

            if self.nproc>0:
                self.additionalReconstruct(self.parameters)

            self.setState("Cleaning")
            self.cleanup(self.parameters)
            self.setState("Finished")
        else:
            self.setState("Suspended")

        if path.exists(self.stopFile()):
            unlink(self.stopFile())
        if path.exists(self.checkpointFile()):
            unlink(self.checkpointFile())

    def casedir(self):
        """Returns the actual directory of the case
        To be overridden if appropriate"""
        if self.arrayJob:
            return "%s.%05d" % (self.basename,self.taskID)
        else:
            return self.basename

    def casename(self):
        """Returns just the name of the case"""
        return path.basename(self.casedir())

    def execute(self,cmd):
        """Execute a shell command in the case directory. No checking done
        @param cmd: the command as a string"""
        oldDir=os.getcwd()
        self.message("Changing directory to",self.casedir())
        os.chdir(self.casedir())
        try:
            self.message("Executing",cmd)
            try:
                retcode = subprocess.call(cmd,shell=True)
                if retcode < 0:
                    self.message(cmd,"was terminated by signal", -retcode)
                else:
                    self.message(cmd,"returned", retcode)
            except OSError:
                e = sys.exc_info()[1] # Needed because python 2.5 does not support 'as e'
                self.message(cmd,"Execution failed:", e)

            self.message("Execution of",cmd,"ended")
        finally:
            # always restore the working directory, even if an unexpected
            # exception propagates out of the subprocess handling
            self.message("Changing directory back to",oldDir)
            os.chdir(oldDir)

    def templateFile(self,fileName):
        """Looks for a template file and evaluates the template using
        the usual parameters
        @param fileName: the name of the file that will be
        constructed. The template file is the same plus the extension '.template'"""

        self.message("Building file",fileName,"from template with parameters",
                     self.parameters)

        argList=["--output-file=%s" % path.join(self.casedir(),fileName),
                 "--dump-used-values"
                 ]

        tmpl=FromTemplate(args=argList,
                          parameters=self.parameters)

    def foamRun(self,application,
                args=None,
                foamArgs=None,
                steady=False,
                multiRegion=True,
                progress=False,
                compress=False,
                noLog=False):
        """Runs a foam utility on the case.
        If it is a parallel job and the grid has
        already been decomposed (and not yet reconstructed) it is run in
        parallel
        @param application: the Foam-Application that is to be run
        @param foamArgs: A list if with the additional arguments for the
        Foam-Application
        @param compress: Compress the log-file
        @param args: A list with additional arguments for the Runner-object
        @param steady: Use the steady-runner
        @param multiRegion: Run this on multiple regions (if None: I don't have an opinion on this)
        @param progress: Only output the time and nothing else
        @param noLog: Do not generate a logfile"""

        # copy/normalize the argument lists so shared defaults are never mutated
        arglist=[] if args is None else args[:]
        if foamArgs is None:
            foamArgs=[]
        arglist+=["--job-id=%s" % self.fullJobId()]
        for k,v in iteritems(self.parameters):
            arglist+=["--parameter=%s:%s" % (str(k),str(v))]

        if self.isDecomposed and self.nproc>1:
            arglist+=["--procnr=%d" % self.nproc]
            if config().getboolean("ClusterJob","useMachineFile"):
                arglist+=["--machinefile=%s" % self.hostfile]

        arglist+=["--echo-command-prefix='=== Executing'"]

        if progress:
            arglist+=["--progress"]
        if noLog:
            arglist+=["--no-log"]
        if compress:
            arglist+=["--compress"]

        if self.multiRegion:
            if multiRegion:
                arglist+=["--all-regions"]
        elif multiRegion:
            warning("This is not a multi-region case, so trying to run stuff multi-region won't do any good")

        if self.restarted:
            arglist+=["--restart"]

        arglist+=[application]
        if oldApp():
            arglist+=[".",self.casename()]
        else:
            arglist+=["-case",self.casename()]

        arglist+=foamArgs

        self.message("Executing",arglist)

        if steady:
            self.message("Running Steady")
            runner=SteadyRunner(args=arglist)
        else:
            runner=Runner(args=arglist)

    def autoDecompose(self):
        """Automatically decomposes the grid with a metis-algorithm
        (scotch for OpenFOAM 1.6 and later)"""

        if path.isdir(path.join(self.casedir(),"processor0")):
            warning("A processor directory already exists. There might be a problem")

        defaultMethod="metis"

        if getFoamVersion()>=(1,6):
            defaultMethod="scotch"

        args=["--method="+defaultMethod,
              "--clear",
              self.casename(),
              self.nproc,
              "--job-id=%s" % self.fullJobId()]

        if self.multiRegion:
            args.append("--all-regions")

        deco=Decomposer(args=args)

    def autoReconstruct(self):
        """Default reconstruction of a parallel run"""

        if self.doAutoReconstruct:
            self.isDecomposed=False

            self.foamRun("reconstructPar",
                         args=["--logname=ReconstructPar"])
        else:
            self.message("No reconstruction (because asked to)")

    def setup(self,parameters):
        """Set up the job. Called in the beginning if the
        job has not been restarted

        Usual tasks include grid conversion/setup, mesh decomposition etc

        @param parameters: a dictionary with parameters"""

        pass

    def postDecomposeSetup(self,parameters):
        """Additional setup, to be executed when the grid is already decomposed

        Usually for tasks that can be done on a decomposed grid

        @param parameters: a dictionary with parameters"""

        pass

    def run(self,parameters):
        """Run the actual job. Usually the solver.
        @param parameters: a dictionary with parameters"""

        pass

    def preReconstructCleanup(self,parameters):
        """Additional cleanup, to be executed when the grid is still decomposed

        Usually for tasks that can be done on a decomposed grid

        @param parameters: a dictionary with parameters"""

        pass

    def cleanup(self,parameters):
        """Clean up after a job
        @param parameters: a dictionary with parameters"""

        pass

    def additionalReconstruct(self,parameters):
        """Additional reconstruction of parallel runs (Stuff that the
        OpenFOAM-reconstructPar doesn't do
        @param parameters: a dictionary with parameters"""

        pass

    def taskParameters(self,id):
        """Parameters for a specific task
        @param id: the id of the task
        @return: a dictionary with parameters for this task"""

        error("taskParameter not implemented. Not a parameterized job")

        return {}

    def additionalParameters(self):
        """Additional parameters
        @return: a dictionary with parameters for this task"""

        warning("Method 'additionalParameters' not implemented. Not a problem. Just saying")

        return {}

    def writeCheckpoint(self):
        """Signal the running solver to write data by creating a 'write'
        file in the case, then remove the checkpoint trigger-file"""
        if self.listenToTimer:
            with open(path.join(self.basename,"write"),"w") as f:
                f.write("Jetzt will ich's wissen")
            unlink(self.checkpointFile())
        else:
            warning("I'm not listening to your callbacks")
        # NOTE: the original code created (but never started) another Timer
        # here. That timer object was dead weight - checkForMessageFromAbove
        # re-arms the polling timer itself - so it has been removed

    def stopJob(self):
        """Signal the running solver to stop by creating a 'stop' file
        in the case, then remove the stop trigger-file"""
        if self.listenToTimer:
            self.ordinaryEnd=False
            with open(path.join(self.basename,"stop"),"w") as f:
                f.write("Geh z'haus")
            unlink(self.stopFile())
        else:
            warning("I'm not listening to your callbacks")

class SolverJob(ClusterJob):
    """A Cluster-Job that executes a solver. It implements the run-function.
    If a template-case is specified, the case is copied"""

    def __init__(self,basename,solver,
                 template=None,
                 cloneParameters=None,
                 arrayJob=False,
                 hardRestart=False,
                 autoParallel=True,
                 doAutoReconstruct=None,
                 foamVersion=None,
                 compileOption=None,
                 useFoamMPI=False,
                 steady=False,
                 multiRegion=False,
                 parameters=None,
                 progress=False,
                 solverProgress=False,
                 solverNoLog=False,
                 solverLogCompress=False,
                 isDecomposed=False):
        """@param template: Name of the template-case. It is assumed that
        it resides in the same directory as the actual case
        @param cloneParameters: a list with additional parameters for the
        CloneCase-object that copies the template
        @param solverProgress: Only writes the current time of the solver"""

        # copy the list: the original default of [] was extended in place
        # below ('--parallel'), permanently polluting the shared default
        # for every later decomposed instance
        cloneParameters=[] if cloneParameters is None else list(cloneParameters)
        if parameters is None:
            parameters={}

        ClusterJob.__init__(self,basename,
                            arrayJob=arrayJob,
                            hardRestart=hardRestart,
                            autoParallel=autoParallel,
                            doAutoReconstruct=doAutoReconstruct,
                            foamVersion=foamVersion,
                            compileOption=compileOption,
                            useFoamMPI=useFoamMPI,
                            multiRegion=multiRegion,
                            parameters=parameters,
                            isDecomposed=isDecomposed)
        self.solver=solver
        self.steady=steady
        if template!=None and not self.restarted:
            template=path.join(path.dirname(self.casedir()),template)
            if path.abspath(basename)==path.abspath(template):
                error("The basename",basename,"and the template",template,"are the same directory")
            if isDecomposed:
                cloneParameters+=["--parallel"]
            clone=CloneCase(
                args=cloneParameters+[template,self.casedir(),"--follow-symlinks"])
        self.solverProgress=solverProgress
        self.solverNoLog=solverNoLog
        self.solverLogCompress=solverLogCompress

    def run(self,parameters):
        """Run the configured solver via foamRun with the options that
        were fixed at construction time
        @param parameters: a dictionary with parameters"""
        self.foamRun(self.solver,
                     steady=self.steady,
                     multiRegion=False,
                     progress=self.solverProgress,
                     noLog=self.solverNoLog,
                     compress=self.solverLogCompress)

# Should work with Python3 and Python2