*** torque-1.0.1p6/src/resmom/mom_main.c Tue Mar 30 04:29:17 2004 --- torque-1.0.1p6-tmpdir/src/resmom/mom_main.c Tue Mar 30 17:47:29 2004 *************** *** 188,193 **** --- 188,196 ---- extern void add_resc_def(char *,char *); + #ifdef TRANSIENT_TMPDIR + extern char tmpdir_basename[]; + #endif /* TRANSIENT_TMPDIR */ /* Local Data Items */ *************** *** 853,858 **** --- 856,871 ---- } /* END setserver() */ + #ifdef TRANSIENT_TMPDIR + static u_long + settmpdir(char *name) + { + static char id[] = "settmpdir"; + + strcpy(tmpdir_basename, name); + return 0; + } + #endif /* TRANSIENT_TMPDIR */ *************** *** 1289,1294 **** --- 1302,1310 ---- { "logevent", setlogevent }, { "max_load", setmaxload }, { "prologalarm", prologalarm }, + #ifdef TRANSIENT_TMPDIR + { "tmpdir", settmpdir }, + #endif /* TRANSIENT_TMPDIR */ { "restricted", restricted }, { "usecp", usecp }, { "wallmult", wallmult }, *************** *** 3667,3672 **** --- 3683,3692 ---- if (gethostname(ret_string,ret_size) == 0) addclient(ret_string); + #ifdef TRANSIENT_TMPDIR + settmpdir(TMP_DIR); + #endif /* TRANSIENT_TMPDIR */ + if (read_config(NULL)) { fprintf(stderr,"%s: config file '%s' failed\n", *** torque-1.0.1p6/src/resmom/start_exec.c Tue Mar 30 04:29:17 2004 --- torque-1.0.1p6-tmpdir/src/resmom/start_exec.c Tue Mar 30 17:45:03 2004 *************** *** 141,146 **** --- 141,150 ---- int mom_reader_go; /* see catchinter() & mom_writer() */ struct var_table vtable; /* for building up Job's environ */ + #ifdef TRANSIENT_TMPDIR + char tmpdir_basename[MAXPATHLEN]; + #endif /* TRANSIENT_TMPDIR */ + /* Local Varibles */ static int script_in; /* script file, will be stdin */ *************** *** 160,165 **** --- 164,172 ---- "PBS_NODENUM", "PBS_TASKNUM", "PBS_MOMPORT", + #ifdef TRANSIENT_TMPDIR + "TMPDIR", + #endif /* TRANSIENT_TMPDIR */ "PBS_NODEFILE" }; static char *variables_env[NUM_LCL_ENV_VAR]; *************** *** 622,627 **** --- 629,638 ---- struct array_strings *vstrs; struct stat sb; struct sockaddr_in saddr; + #ifdef TRANSIENT_TMPDIR + int rc; + char tmpdir[MAXPATHLEN]; + #endif if (pjob->ji_numnodes > 1) { *************** *** 1250,1256 **** --- 1261,1271 ---- path_home, pjob->ji_qs.ji_jobid); + #ifdef TRANSIENT_TMPDIR + bld_env_variables(&vtable,variables_else[12],buf); + #else bld_env_variables(&vtable,variables_else[11],buf); + #endif if ((nhow = fopen(buf,"w")) == NULL) { *************** *** 1290,1295 **** --- 1301,1363 ---- fclose(nhow); } + #ifdef TRANSIENT_TMPDIR + /* only when TMPDIR is not set by someone else */ + if ( getenv("TMPDIR") == NULL ) { + *buf=0; + snprintf(tmpdir, sizeof(tmpdir), "%s/%s", tmpdir_basename, pjob->ji_qs.ji_jobid); + rc=stat(tmpdir,&sb); + if (rc) rc=errno; + switch (rc) { + case ENOENT: + break; + case 0: + if( ! S_ISDIR(sb.st_mode)) { + sprintf(log_buffer, "Job transient tmpdir %s exists, but is not a directory", tmpdir); + log_err(errno, id, log_buffer); + starter_return(upfds, downfds, JOB_EXEC_FAIL1, &sjr); + } + break; + default: + sprintf(log_buffer, "Cannot name job tmp directory %s (on stat)", tmpdir); + log_err(errno, id, log_buffer); + starter_return(upfds, downfds, JOB_EXEC_FAIL1, &sjr); + break; + } + + /* only Mother Superior shoudl create the directory */ + if ( pjob->ji_qs.ji_svrflags & JOB_SVFLG_HERE ) { + if (mkdir(tmpdir,0755) == -1 ) { + sprintf(log_buffer, "cannot mkdir tmpdir %s", tmpdir); + log_err(errno, id, log_buffer); + starter_return(upfds, downfds, JOB_EXEC_FAIL1, &sjr); + } + if (chown(tmpdir,pjob->ji_qs.ji_un.ji_momt.ji_exuid,pjob->ji_qs.ji_un.ji_momt.ji_exgid) == -1 ) { + sprintf(log_buffer, "cannot chown tmpdir %s to %d:%d", tmpdir, + pjob->ji_qs.ji_un.ji_momt.ji_exuid, + pjob->ji_qs.ji_un.ji_momt.ji_exgid); + log_err(errno, id, log_buffer); + starter_return(upfds, downfds, JOB_EXEC_FAIL1, &sjr); + } + if (chmod(tmpdir,0755) == -1 ) { + sprintf(log_buffer, "cannot chmod tmpdir %s", tmpdir); + log_err(errno, id, log_buffer); + starter_return(upfds, downfds, JOB_EXEC_FAIL1, &sjr); + } + } + + sprintf(log_buffer,"using transient tmpdir %s",tmpdir); + log_record(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, + pjob->ji_qs.ji_jobid, log_buffer); + + bld_env_variables(&vtable, variables_else[11], tmpdir); + } else { + *tmpdir=0; /* did not create a unique directory */ + bld_env_variables(&vtable, variables_else[11], getenv("TMPDIR")); + } + #endif /* TRANSIENT_TMPDIR */ + + /* specific system related variables */ j = set_mach_vars(pjob, &vtable); *** torque-1.0.1p6/src/resmom/catch_child.c Tue Mar 30 04:29:17 2004 --- torque-1.0.1p6-tmpdir/src/resmom/catch_child.c Tue Mar 30 17:48:39 2004 *************** *** 83,88 **** --- 83,91 ---- #include #include #include + #ifdef TRANSIENT_TMPDIR + #include + #endif /* TRANSIENT_TMPDIR */ #include #include #include "dis.h" *************** *** 123,128 **** --- 126,134 ---- extern struct connection svr_conn[]; extern int resc_access_perm; extern char *path_home; + #ifdef TRANSIENT_TMPDIR + extern char tmpdir_basename[]; + #endif /* TRANSIENT_TMPDIR */ extern int LOGLEVEL; *************** *** 300,305 **** --- 306,314 ---- char *svrport; char *cookie; unsigned int port; + #ifdef TRANSIENT_TMPDIR + char tmpdir[MAXPATHLEN]; + #endif /* TRANSIENT_TMPDIR */ u_long gettime A_((resource *pres)); u_long getsize A_((resource *pres)); task *task_find A_((job *pjob,tm_task_id taskid)); *************** *** 573,578 **** --- 582,605 ---- pjob->ji_flags &= ~MOM_HAS_NODEFILE; } + #ifdef TRANSIENT_TMPDIR + /* this is the set of condition for a transient tmpdir as used at + * creation time in start_exec.c + * we already know we are Mother Superior */ + if ( (getenv("TMPDIR") == NULL) && *tmpdir_basename ) { + snprintf(tmpdir, sizeof(tmpdir), "%s/%s", tmpdir_basename, pjob->ji_qs.ji_jobid); + sprintf(log_buffer,"Removing transient job directory %s",tmpdir); + log_record(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, + pjob->ji_qs.ji_jobid, log_buffer); + + if ( rrmdir(tmpdir) != 0 ) { + (void)sprintf(log_buffer, + "recursive remove of job transient tmpdir %s failed", + tmpdir); + log_err(errno, "recursive (r)rmdir", log_buffer); + } + } + #endif /* TRANSIENT_TMPDIR */ /* Send the Job Obiturary Notice to the server */ *************** *** 609,614 **** --- 636,704 ---- + + #ifdef TRANSIENT_TMPDIR + /* this routine will recursively remove everything below */ + int rrmdir(path) + char *path; + { + + #define SUBPATHLENCHUNK 2048 + + int rc; + DIR *dir; + struct dirent *dent; + struct stat sb; + char *subpath,*p; + int subpathlen=0; + + if ( *path == (int)'0' ) return ENOENT; + if ( (dir=opendir(path)) == NULL ) return errno; + + if ( (subpath=malloc(SUBPATHLENCHUNK)) == NULL ) + return ENOMEM; + subpathlen=SUBPATHLENCHUNK; + + while ( dent=readdir(dir) ) { + /* save me and my parent; and myself from a loop */ + if( *(dent->d_name)=='.' && *(1+dent->d_name)==0 ) continue; + if( *(dent->d_name)=='.' && *(1+dent->d_name)=='.' && *(2+dent->d_name)==0 ) continue; + + if(2+strlen(path)+strlen(dent->d_name) > subpathlen ) { + if( (p=realloc(subpath,subpathlen+SUBPATHLENCHUNK)) == NULL ) { + free(subpath); + return ENOMEM; + } + subpath=p; + subpathlen+=SUBPATHLENCHUNK; + } + strcpy(subpath,path); + strcat(subpath,"/"); + strcat(subpath,dent->d_name); + + if ( (rc=lstat(subpath,&sb)) != 0 ) { + free(subpath); + return rc; + } + + if(S_ISDIR(sb.st_mode)) + rc=rrmdir(subpath); + else + rc=unlink(subpath); + + if(rc) { + free(subpath); + return rc; + } + } + closedir(dir); + free(subpath); + rc=rmdir(path); + return rc; + } + #endif /* TRANSIENT_TMPDIR */ + + /* * obit_reply - read and process the reply from the server acknowledging * the job obiturary notice. *** torque-1.0.1p6/Readme.tmpdir Thu Jan 1 01:00:00 1970 --- torque-1.0.1p6-tmpdir/Readme.tmpdir Tue Mar 30 17:00:47 2004 *************** *** 0 **** --- 1,95 ---- + + Readme.tmpdir - Enabling transient temporary directories for jobs + ------------------------------------------------------------------------------ + + The transient-tmpdir patch enables the per-job TMPDIR functionality + such as found in the CRAY systems and in PBS Pro version 5.2 and + above. This document described the way to enable this patch and how + to configure the behaviour of TMPDIR using the pbs_mom configuration + file. + Although the functionality is the same, the way this patch works and + is configured may be different from PBS Pro or CRAY. + + + Compilation + ----------- + + The transient-tmpdir patch is enabled by defining the TRANSIENT_TMPDIR + symbol when compiling pbs-mom (in src/resmom). When using + autoconf, define CFLAGS as follows + + CFLAGS="$CFLAGS -DTRANSIENT_TMPDIR=1" + export CFLAGS + + ./configure --set-tmpdir= [ other configure options ] + + and run "make" and "make install". The default for tmpdir is defined + by configure, and set to "/tmp" unless spoecified otherwise. + + + How it works + ------------ + + Transient job directories are only created when a non-null value + is set for TMP_DIR (either using configure or by using the + "$tmpdir " directive in mom's config file), and the TMPDIR + environment variable is not set in the pbs_environment file. + + Before the job prologue script (or the job itself) is started, + a temporary, per-job, directory name is generated based on the + value of TMP_DIR (as set in configure or using the $tmpdir directive + in Mom's configuration file) and the job ID. This name is + passed to the job in the $TMPDIR environment variable. + + The Mother-Superior (MS) responsible for the job will actually + create the directory; other mom's will only define the value for the + job and refrain from creating it, although they will ensure that + TMPDIR is pointing to a directory and not a (special) file. + + This directory is created before the prologue script is run, and + TMPDIR will be available to the prologue script as well. + + When the job finished (the job is sent to the Obituary), this + directory is removed by Mother Superior after the epilogue script + has been run and after the PBS_NODEFILE has been removed. + + + Disabling the transient-tmpdir functionality + -------------------------------------------- + + If your copy of (Scalable) OpenPBS has been built with transient-tmpdir + functionality, but you would like to disable the creation of transient + directories, you can do so by explicitly setting TMPDIR in + the pbs_environment file ("TMPDIR=/tmp"). If TMPDIR is set explicitly, + no directories will be created (or removed). + + + Caveats + ------- + + When jobs are using multiple nodes for execution, and the + transient-tmpdir functionality is used, the basename for the temporary + directories (TMP_DIR or the $tmpdir directive in the mom config file) + must be the same for all mom's, and must be pointing to a file system + shared by all nodes. + + + Changes + ------- + + The transient-tmpdir patch only affects the following files + + src/resmom/mom_main.c + src/resmom/start_exec.c + src/resmom/catch_child.c + + + Acknowledgements + ---------------- + + This patch has been provided by the Scientific Computing project + (DataGrid Team) of the Dutch National Institute for Nuclear and + High-Energy Physics (NIKHEF), Amsterdam, The Netherlands. + David Groep , 2003-11-19. + +