Package PyFoam :: Package Infrastructure :: Module ClusterJob
[hide private]
[frames] | no frames]

Source Code for Module PyFoam.Infrastructure.ClusterJob

  1  #  ICE Revision: $Id: /local/openfoam/Python/PyFoam/PyFoam/Infrastructure/Logging.py 1906 2007-08-28T16:16:19.392553Z bgschaid  $  
  2  """Encapsulates all necessary things for a cluster-job, like setting up, running, restarting""" 
  3   
  4  import os,sys 
  5  from os import path,unlink 
  6  from threading import Thread,Lock,Timer 
  7   
  8  from PyFoam.Applications.Decomposer import Decomposer 
  9  from PyFoam.Applications.Runner import Runner 
 10  from PyFoam.Applications.SteadyRunner import SteadyRunner 
 11  from PyFoam.Applications.CloneCase import CloneCase 
 12  from PyFoam.FoamInformation import changeFoamVersion 
 13  from PyFoam.Error import error,warning 
 14  from PyFoam import configuration as config 
 15  from PyFoam.FoamInformation import oldAppConvention as oldApp 
 16   
17 -def checkForMessageFromAbove(job):
18 if not job.listenToTimer: 19 return 20 21 if path.exists(job.stopFile()): 22 job.stopJob() 23 return 24 25 if path.exists(job.checkpointFile()): 26 job.writeCheckpoint() 27 28 job.timer=Timer(1.,checkForMessageFromAbove,args=[job]) 29 job.timer.start()
30 31
32 -class ClusterJob:
33 """ All Cluster-jobs are to be derived from this base-class 34 35 The actual jobs are implemented by overriding methods 36 37 There is a number of variables in this class that are used to 38 'communicate' information between the various stages""" 39
40 - def __init__(self,basename, 41 arrayJob=False, 42 hardRestart=False, 43 autoParallel=True, 44 foamVersion=None, 45 useFoamMPI=False, 46 multiRegion=False):
47 """Initializes the Job 48 @param basename: Basis name of the job 49 @param arrayJob: this job is a parameter variation. The tasks 50 are identified by their task-id 51 @param hardRestart: treat the job as restarted 52 @param autoParallel: Parallelization is handled by the base-class 53 @param foamVersion: The foam-Version that is to be used 54 @param useFoamMPI: Use the OpenMPI supplied with OpenFOAM 55 @param multiRegion: This job consists of multiple regions""" 56 57 # print os.environ 58 59 if not os.environ.has_key("JOB_ID"): 60 error("Not an SGE-job. Environment variable JOB_ID is missing") 61 self.jobID=int(os.environ["JOB_ID"]) 62 self.jobName=os.environ["JOB_NAME"] 63 64 self.basename=path.join(path.abspath(path.curdir),basename) 65 66 sgeRestarted=False 67 if os.environ.has_key("RESTARTED"): 68 sgeRestarted=(int(os.environ["RESTARTED"])!=0) 69 70 if sgeRestarted or hardRestart: 71 self.restarted=True 72 else: 73 self.restarted=False 74 75 if foamVersion==None: 76 foamVersion=config().get("OpenFOAM","Version") 77 78 changeFoamVersion(foamVersion) 79 80 if not os.environ.has_key("WM_PROJECT_VERSION"): 81 error("No OpenFOAM-Version seems to be configured. Set the foamVersion-parameter") 82 83 self.autoParallel=autoParallel 84 self.multiRegion=multiRegion 85 86 self.hostfile=None 87 self.nproc=1 88 89 if os.environ.has_key("NSLOTS"): 90 self.nproc=int(os.environ["NSLOTS"]) 91 self.message("Running on",self.nproc,"CPUs") 92 if self.nproc>1: 93 # self.hostfile=os.environ["PE_HOSTFILE"] 94 self.hostfile=path.join(os.environ["TMP"],"machines") 95 self.message("Using the machinefile",self.hostfile) 96 self.message("Contents of the machinefile:",open(self.hostfile).readlines()) 97 98 self.ordinaryEnd=True 99 self.listenToTimer=False 100 101 self.taskID=None 102 self.arrayJob=arrayJob 103 104 if self.arrayJob: 105 self.taskID=int(os.environ["SGE_TASK_ID"]) 106 107 if not useFoamMPI and not foamVersion in eval(config().get("ClusterJob","useFoamMPI",default='[]')): 108 ## prepend special paths for the cluster 109 self.message("Adding Cluster-specific paths") 110 os.environ["PATH"]=config().get("ClusterJob","path")+":"+os.environ["PATH"] 111 os.environ["LD_LIBRARY_PATH"]=config().get("ClusterJob","ldpath")+":"+os.environ["LD_LIBRARY_PATH"] 112 113 self.isDecomposed=False
114
115 - def message(self,*txt):
116 print "=== CLUSTERJOB: ", 117 for t in txt: 118 print t, 119 print " ===" 120 sys.stdout.flush()
121
122 - def setState(self,txt):
123 self.message("Setting Job state to",txt) 124 fName=path.join(self.casedir(),"ClusterJobState") 125 f=open(fName,"w") 126 f.write(txt+"\n") 127 f.close()
128
129 - def jobFile(self):
130 """The file with the job information""" 131 jobfile="%s.%d" % (self.jobName,self.jobID) 132 if self.arrayJob: 133 jobfile+=".%d" % self.taskID 134 jobfile+=".pyFoam.clusterjob" 135 jobfile=path.join(path.dirname(self.basename),jobfile) 136 137 return jobfile
138
139 - def checkpointFile(self):
140 """The file that makes the job write a checkpoint""" 141 return self.jobFile()+".checkpoint"
142
143 - def stopFile(self):
144 """The file that makes the job write a checkpoint and end""" 145 return self.jobFile()+".stop"
146
147 - def doIt(self):
148 """The central logic. Runs the job, sets it up etc""" 149 150 f=open(self.jobFile(),"w") 151 f.write(path.basename(self.basename)+"\n") 152 f.close() 153 154 self.message() 155 self.message("Running on directory",self.casename()) 156 self.message() 157 self.setState("Starting up") 158 159 parameters=None 160 if self.arrayJob: 161 parameters=self.taskParameters(self.taskID) 162 self.message("Parameters:",parameters) 163 if not self.restarted: 164 self.setState("Setting up") 165 self.setup(parameters) 166 if self.autoParallel and self.nproc>1: 167 self.setState("Decomposing") 168 self.autoDecompose() 169 170 self.isDecomposed=True 171 172 self.setState("Setting up 2") 173 self.postDecomposeSetup(parameters) 174 else: 175 self.setState("Restarting") 176 177 self.isDecomposed=True 178 179 self.setState("Running") 180 self.listenToTimer=True 181 self.timer=Timer(1.,checkForMessageFromAbove,args=[self]) 182 self.timer.start() 183 184 self.run(parameters) 185 self.listenToTimer=False 186 187 if path.exists(self.jobFile()): 188 unlink(self.jobFile()) 189 190 if self.ordinaryEnd: 191 self.setState("Post Running") 192 self.preReconstructCleanup(parameters) 193 194 self.isDecomposed=False 195 196 if self.autoParallel and self.nproc>1: 197 self.setState("Reconstructing") 198 self.autoReconstruct() 199 200 if self.nproc>0: 201 self.additionalReconstruct(parameters) 202 203 self.setState("Cleaning") 204 self.cleanup(parameters) 205 self.setState("Finished") 206 else: 207 self.setState("Suspended") 208 209 if path.exists(self.stopFile()): 210 unlink(self.stopFile()) 211 if path.exists(self.checkpointFile()): 212 unlink(self.checkpointFile())
213
214 - def casedir(self):
215 """Returns the actual directory of the case 216 To be overridden if appropriate""" 217 if self.arrayJob: 218 return "%s.%05d" % (self.basename,self.taskID) 219 else: 220 return self.basename
221
222 - def casename(self):
223 """Returns just the name of the case""" 224 return path.basename(self.casedir())
225
226 - def foamRun(self,application, 227 args=[], 228 foamArgs=[], 229 steady=False, 230 multiRegion=None, 231 progress=False, 232 noLog=False):
233 """Runs a foam utility on the case. 234 If it is a parallel job and the grid has 235 already been decomposed (and not yet reconstructed) it is run in 236 parallel 237 @param application: the Foam-Application that is to be run 238 @param foamArgs: A list if with the additional arguments for the 239 Foam-Application 240 @param args: A list with additional arguments for the Runner-object 241 @param steady: Use the steady-runner 242 @param multiRegion: Run this on multiple regions (if None: I don't have an opinion on this) 243 @param progress: Only output the time and nothing else 244 @param noLog: Do not generate a logfile""" 245 246 arglist=args[:] 247 if self.isDecomposed and self.nproc>1: 248 arglist+=["--procnr=%d" % self.nproc, 249 "--machinefile=%s" % self.hostfile] 250 if progress: 251 arglist+=["--progress"] 252 if noLog: 253 arglist+=["--no-log"] 254 255 if self.multiRegion: 256 if multiRegion==None or multiRegion==True: 257 arglist+=["--all-regions"] 258 elif multiRegion and not self.multiRegion: 259 warning("This is not a multi-region case, so trying to run stuff multi-region won't do any good") 260 261 if self.restarted: 262 arglist+=["--restart"] 263 264 arglist+=[application] 265 if oldApp(): 266 arglist+=[".",self.casename()] 267 else: 268 arglist+=["-case",self.casename()] 269 270 arglist+=foamArgs 271 272 self.message("Executing",arglist) 273 274 if steady: 275 self.message("Running Steady") 276 runner=SteadyRunner(args=arglist) 277 else: 278 runner=Runner(args=arglist)
279
280 - def autoDecompose(self):
281 """Automatically decomposes the grid with a metis-algorithm""" 282 283 if path.isdir(path.join(self.casedir(),"processor0")): 284 warning("A processor directory already exists. There might be a problem") 285 args=["--method=metis", 286 "--clear", 287 self.casename(), 288 self.nproc] 289 290 if self.multiRegion: 291 args.append("--all-regions") 292 293 deco=Decomposer(args=args)
294
295 - def autoReconstruct(self):
296 """Default reconstruction of a parallel run""" 297 298 self.foamRun("reconstructPar", 299 args=["--logname=ReconstructPar"])
300
301 - def setup(self,parameters):
302 """Set up the job. Called in the beginning if the 303 job has not been restarted 304 305 Usual tasks include grid conversion/setup, mesh decomposition etc 306 307 @param parameters: a dictionary with parameters""" 308 309 pass
310
311 - def postDecomposeSetup(self,parameters):
312 """Additional setup, to be executed when the grid is already decomposed 313 314 Usually for tasks that can be done on a decomposed grid 315 316 @param parameters: a dictionary with parameters""" 317 318 pass
319
320 - def run(self,parameters):
321 """Run the actual job. Usually the solver. 322 @param parameters: a dictionary with parameters""" 323 324 pass
325
326 - def preReconstructCleanup(self,parameters):
327 """Additional cleanup, to be executed when the grid is still decomposed 328 329 Usually for tasks that can be done on a decomposed grid 330 331 @param parameters: a dictionary with parameters""" 332 333 pass
334
335 - def cleanup(self,parameters):
336 """Clean up after a job 337 @param parameters: a dictionary with parameters""" 338 339 pass
340
341 - def additionalReconstruct(self,parameters):
342 """Additional reconstruction of parallel runs (Stuff that the 343 OpenFOAM-reconstructPar doesn't do 344 @param parameters: a dictionary with parameters""" 345 346 pass
347
348 - def taskParameters(self,id):
349 """Parameters for a specific task 350 @param id: the id of the task 351 @return: a dictionary with parameters for this task""" 352 353 error("taskParameter not implemented. Not a parameterized job") 354 355 return {}
356
357 - def writeCheckpoint(self):
358 if self.listenToTimer: 359 f=open(path.join(self.basename,"write"),"w") 360 f.write("Jetzt will ich's wissen") 361 f.close() 362 unlink(self.checkpointFile()) 363 else: 364 warning("I'm not listening to your callbacks") 365 366 self.timer=Timer(1.,checkForMessageFromAbove,args=[self])
367
368 - def stopJob(self):
369 if self.listenToTimer: 370 self.ordinaryEnd=False 371 f=open(path.join(self.basename,"stop"),"w") 372 f.write("Geh z'haus") 373 f.close() 374 unlink(self.stopFile()) 375 else: 376 warning("I'm not listening to your callbacks")
377
378 -class SolverJob(ClusterJob):
379 """A Cluster-Job that executes a solver. It implements the run-function. 380 If a template-case is specified, the case is copied""" 381
382 - def __init__(self,basename,solver, 383 template=None, 384 cloneParameters=[], 385 arrayJob=False, 386 hardRestart=False, 387 autoParallel=True, 388 foamVersion=None, 389 useFoamMPI=False, 390 steady=False, 391 multiRegion=False, 392 progress=False, 393 solverProgress=False, 394 solverNoLog=False):
395 """@param template: Name of the template-case. It is assumed that 396 it resides in the same directory as the actual case 397 @param cloneParameters: a list with additional parameters for the 398 CloneCase-object that copies the template 399 @param solverProgress: Only writes the current time of the solver""" 400 401 ClusterJob.__init__(self,basename, 402 arrayJob=arrayJob, 403 hardRestart=hardRestart, 404 autoParallel=autoParallel, 405 foamVersion=foamVersion, 406 useFoamMPI=useFoamMPI, 407 multiRegion=multiRegion) 408 self.solver=solver 409 self.steady=steady 410 if template!=None and not self.restarted: 411 template=path.join(path.dirname(self.casedir()),template) 412 if path.abspath(basename)==path.abspath(template): 413 error("The basename",basename,"and the template",template,"are the same directory") 414 clone=CloneCase( 415 args=cloneParameters+[template,self.casedir(),"--follow-symlinks"]) 416 self.solverProgress=solverProgress 417 self.solverNoLog=solverNoLog
418
419 - def run(self,parameters):
420 self.foamRun(self.solver, 421 steady=self.steady, 422 multiRegion=False, 423 progress=self.solverProgress, 424 noLog=self.solverNoLog)
425