
#include <assert.h>

#include "nexus.h"

#include "globus_gram_client.h"

#include "globus_duroc_control.h"

#include "subjob.h"
#include "job_monitor.h"
#include "control.h"

#include "duroc-common.h"


/* message id passed to nexus_send_rsr when a subjob barrier is released */
#define RUN_MSG_ID 0
/* not referenced in the visible part of this file;
 * presumably the id of the matching "die" command -- TODO confirm */
#define DIE_MSG_ID 1



/*
 * error helpers for globus_duroc_control_i_job_monitor_init:
 * each one reports the failing step together with the raw error value
 * through globus_duroc_at_error and evaluates to the DUROC error code
 * that the init routine returns after unwinding.
 */
#define s_monitor_init_cond_err(err) \
(globus_duroc_at_error ("cond init", err), GLOBUS_DUROC_ERROR_NEXUS_FAILED)
#define s_monitor_init_mutex_err(err) \
(globus_duroc_at_error ("mutex init", err), GLOBUS_DUROC_ERROR_NEXUS_FAILED)
#define s_monitor_init_duct_init_err(err) \
     (globus_duroc_at_error("globus_duct_init", err), GLOBUS_DUROC_ERROR_DUCT_FAILED)
#define s_monitor_init_serialno_err(err) \
(globus_duroc_at_error ("create job serial number", err), \
 GLOBUS_DUROC_ERROR_INTERNAL_FAILURE)
#define s_monitor_init_serialno_t_err(err) \
(globus_duroc_at_error ("hashtable_init", err), GLOBUS_DUROC_ERROR_INIT_FAILED)
#define s_monitor_init_label_t_err(err) \
(globus_duroc_at_error ("hashtable_init", err), GLOBUS_DUROC_ERROR_INIT_FAILED)
#define s_monitor_init_globus_gram_t_err(err) \
(globus_duroc_at_error ("hashtable_init", err), GLOBUS_DUROC_ERROR_INIT_FAILED)
#define s_monitor_init_link_monitor_err(err) \
(globus_duroc_at_error ("link job monitor", err), GLOBUS_DUROC_ERROR_INTERNAL_FAILURE)

/*
 * initialize monitor record
 * bind into server (with unique serial number)
 *
 * steps, in order: reset the plain fields, create the mutex and
 * condition variable, initialize the duct control, obtain a job serial
 * number from the control, create the three subjob hashtables
 * (by serial number, by label, by GRAM contact) and finally link the
 * record into the control.
 *
 * returns GLOBUS_DUROC_SUCCESS, GLOBUS_DUROC_ERROR_INVALID_PARAMETER
 * when either argument is NULL, or the code produced by the
 * s_monitor_init_*_err helper of the step that failed.  On failure
 * everything created by the earlier steps is destroyed again.
 *
 * if this call succeeds the record will be scheduled
 * for reference-count garbage-collection.  
 * (*job_monitorp) MUST have been globus_malloc'd
 * (the destroy routine hands it to globus_free).
 */
int
globus_duroc_control_i_job_monitor_init (globus_duroc_control_t     * controlp,
				  globus_duroc_job_monitor_t * job_monitorp)
{
  int err;

  if ( (controlp==NULL) || (job_monitorp==NULL) ) 
    return GLOBUS_DUROC_ERROR_INVALID_PARAMETER;

  
  /* plain state: no references, no subjobs, barrier untouched */
  job_monitorp->ref_count = 0;
  job_monitorp->next_free_serialno = 1;
  job_monitorp->subjobs = NULL;
  job_monitorp->release_barrier = GLOBUS_FALSE;
  job_monitorp->barrier_released = GLOBUS_FALSE;
  job_monitorp->job_canceled = GLOBUS_FALSE;


  err = nexus_mutex_init (&(job_monitorp->mutex), NULL);
  if (err) {
    err = s_monitor_init_mutex_err (err);
    goto monitor_init_mutex_error;
  }

  err = nexus_cond_init (&(job_monitorp->cond), NULL);
  if (err) {
    err = s_monitor_init_cond_err (err);
    goto monitor_init_cond_error;
  }

  err = globus_duct_control_init (&(job_monitorp->duct_control),
				0 /* supply size later */,
				NULL /* config callback unsupported */,
				NULL /* config callback userdata */);
  if (err) {
    err = s_monitor_init_duct_init_err (err);
    goto monitor_init_duct_init_error;
  }

  /* a non-positive serial number is treated as failure */
  job_monitorp->serialno = globus_duroc_control_i_control_make_job_no (controlp);
  if (job_monitorp->serialno <= 0) {
    err = s_monitor_init_serialno_err (job_monitorp->serialno);
    goto monitor_init_serialno_error;
  }

  /* subjob serial number -> subjob record */
  err = globus_hashtable_init (&(job_monitorp->subjob_serialno_hasht), 
			       16 /* nice default size */,
			       globus_hashtable_int_hash, 
			       globus_hashtable_int_keyeq);
  if (err) {
    err = s_monitor_init_serialno_t_err (err);
    goto monitor_init_serialno_t_init_error;
  }

  /* subjob label -> subjob serial number */
  err = globus_hashtable_init (&(job_monitorp->subjob_label_hasht), 
			       16 /* nice default size */,
			       globus_hashtable_string_hash, 
			       globus_hashtable_string_keyeq);
  if (err) {
    err = s_monitor_init_label_t_err (err);
    goto monitor_init_label_t_init_error;
  }

  /* GRAM contact string -> subjob serial number */
  err = globus_hashtable_init (&(job_monitorp->subjob_globus_gram_hasht), 
			       16 /* nice default size */,
			       globus_hashtable_string_hash, 
			       globus_hashtable_string_keyeq);
  if (err) {
    err = s_monitor_init_globus_gram_t_err (err);
    goto monitor_init_globus_gram_t_init_error;
  }

  err = globus_duroc_control_i_control_link_job (controlp, job_monitorp);
  if (err) {
    err = s_monitor_init_link_monitor_err (err);
    goto monitor_init_link_monitor_error;
  }

  return GLOBUS_DUROC_SUCCESS;

  /* monitor_init_error_clauses:
   *
   * each label below is entered when the step named by the label
   * failed; execution falls through the remaining labels so that
   * everything set up before that step is undone in reverse order.
   * linking is the last step, so no unlink is needed:

  globus_duroc_control_i_control_unlink_job (controlp, job_monitorp); */
 monitor_init_link_monitor_error:

  globus_hashtable_destroy (&(job_monitorp->subjob_globus_gram_hasht));
 monitor_init_globus_gram_t_init_error:

  globus_hashtable_destroy (&(job_monitorp->subjob_label_hasht));
 monitor_init_label_t_init_error:

  globus_hashtable_destroy (&(job_monitorp->subjob_serialno_hasht));
 monitor_init_serialno_t_init_error:

  /* the serial number step has nothing of its own to undo here */
 monitor_init_serialno_error:

  globus_duct_control_destroy (&(job_monitorp->duct_control));
 monitor_init_duct_init_error:

  nexus_cond_destroy (&(job_monitorp->cond));
 monitor_init_cond_error:

  nexus_mutex_destroy (&(job_monitorp->mutex));
 monitor_init_mutex_error:

  return err;
}

/*
 * hand out the next unused subjob serial number of this job
 *
 * the counter is read and advanced while the job monitor mutex
 * is held; the value before the increment is returned.
 */
int 
globus_duroc_control_i_job_monitor_make_subjob_no (globus_duroc_job_monitor_t *job_monitorp)
{
  int rc;
  int allocated;

  rc = nexus_mutex_lock (&(job_monitorp->mutex));
  assert (!rc);

  allocated = job_monitorp->next_free_serialno++;

  rc = nexus_mutex_unlock (&(job_monitorp->mutex));
  assert (!rc);

  return allocated;
}

#define s_link_subjob_duplicate_label_err(err) \
(globus_duroc_at_error ("user supplied duplicate subjob label", err), \
/* GLOBUS_DUROC_ERROR_DUPLICATE_SUBJOB_LABEL */ \
 globus_error_put (globus_object_construct (GLOBUS_ERROR_TYPE_BAD_DATA)))

/*
 * register a subjob with its job monitor
 *
 * the subjob is entered into the serial-number table, into the label
 * table (only when a label was supplied) and into the subjobs list,
 * all while the job monitor mutex is held.
 *
 * returns GLOBUS_SUCCESS, or a GLOBUS_ERROR_TYPE_BAD_DATA error result
 * when the label is already present in the label table; in that case
 * nothing has been inserted.
 */
globus_result_t
globus_duroc_control_i_job_monitor_link_subjob (globus_duroc_job_monitor_t * job_monitorp,
					 globus_duroc_subjob_t      * subjobp,
					 const char          * label)
{
  globus_result_t res;
  int err, err2;
  int subjob_serialno;

  /* snapshot the serial number under the subjob's own lock */
  err = nexus_mutex_lock (&(subjobp->mutex)); assert (!err);
  subjob_serialno = subjobp->serialno;
  err = nexus_mutex_unlock (&(subjobp->mutex)); assert (!err);

  err = nexus_mutex_lock (&(job_monitorp->mutex)); assert (!err);

  utils_debug (GLOBUS_DUROC_DEBUG_FLAG,
	       "job monitor link subjob: subjob <%d, %d>, label >>%s<<\n",
	       job_monitorp->serialno, subjob_serialno,
	       (label ? label : "(none)"));

  if ( label ) {
    void * datum;

    /* NOTE(review): the table is probed (and later keyed) with
     * subjobp->label rather than the label argument; presumably the
     * caller has already stored a copy of label there -- confirm */
    datum = globus_hashtable_lookup (&(job_monitorp->subjob_label_hasht),
				     (void *) subjobp->label);
    if ( datum != NULL ) {
      /* duplicate label! */
      utils_debug (GLOBUS_DUROC_DEBUG_FLAG,
		   "job monitor link subjob: user error: duplicate "
		   "subjob label >>%s<< supplied!\n", 
		   label);

      res = s_link_subjob_duplicate_label_err (0);
      goto s_link_subjob_duplicate_label_error;
    }
  }

  /* key is cast through long exactly as the lookup and remove
   * call sites for this table do */
  err = globus_hashtable_insert (&(job_monitorp->subjob_serialno_hasht),
				 (void *) (long) subjob_serialno,
				 (void *) subjobp);
  assert (!err);

  if ( label ) {
    err = globus_hashtable_insert (&(job_monitorp->subjob_label_hasht),
				   (void *) subjobp->label,
				   (void *) (long) subjob_serialno);
    assert (!err);
  }

  err = globus_list_insert (&(job_monitorp->subjobs),
			    (void *) subjobp);
  assert (!err);

  err = nexus_mutex_unlock (&(job_monitorp->mutex)); assert (!err);

  return GLOBUS_SUCCESS;


s_link_subjob_duplicate_label_error:
  err2 = nexus_mutex_unlock (&(job_monitorp->mutex)); assert (!err2);

  return res;
}

/*
 * remove a subjob from its job monitor
 *
 * the subjob is taken out of the serial-number table, the label table
 * (when it has a label) and the subjobs list while the job monitor
 * mutex is held.  Each removal is checked with an assertion against
 * the values read from the subjob record.
 */
void
globus_duroc_control_i_job_monitor_unlink_subjob (globus_duroc_job_monitor_t *job_monitorp,
					   globus_duroc_subjob_t * subjobp)
{
  int err;
  int serialno;
  globus_list_t *node;
  globus_duroc_subjob_t * subjobp2;
  int subjob_serialno;
  char * subjob_label;

  /* snapshot serial number and a private copy of the label under the
   * subjob lock so the tables can be searched without holding it */
  err = nexus_mutex_lock (&(subjobp->mutex)); assert (!err);
  subjob_serialno = subjobp->serialno;
  if ( subjobp->label ) 
    subjob_label = utils_strdup (subjobp->label);
  else
    subjob_label = NULL;
  err = nexus_mutex_unlock (&(subjobp->mutex)); assert (!err);

  err = nexus_mutex_lock (&(job_monitorp->mutex)); assert (!err);

  utils_debug (GLOBUS_DUROC_DEBUG_FLAG,
	       "job monitor unlink subjob: subjob <%d, %d>, label >>%s<<\n",
	       job_monitorp->serialno,
	       subjob_serialno,
	       (subjob_label ? subjob_label : "(null)"));
  
  subjobp2 = ((globus_duroc_subjob_t *)
	      globus_hashtable_remove (&(job_monitorp->subjob_serialno_hasht),
				       (void *) (long) subjob_serialno));
  if ( subjobp != subjobp2 ) 
    utils_debug (GLOBUS_DUROC_DEBUG_FLAG,
		 "job monitor unlink subjobp %x != removed subjobp %x!!\n",
		 (int) (long) subjobp, (int) (long) subjobp2);
  assert (subjobp == subjobp2);

  if ( subjob_label ) {
    serialno = ((int) (long)
		globus_hashtable_remove (&(job_monitorp->subjob_label_hasht),
					 (void *) subjob_label));
    if ( serialno != subjob_serialno )
      utils_debug (GLOBUS_DUROC_DEBUG_FLAG,
	   "job monitor unlink subjob no. %d != removed subjobno %x!!\n",
		   subjob_serialno, serialno);
    assert (serialno == subjob_serialno);
  }

  node = globus_list_search (job_monitorp->subjobs,
			     (void *) subjobp);
  subjobp2 = ((globus_duroc_subjob_t *)
	      globus_list_remove (&(job_monitorp->subjobs),
				  node));
  if ( subjobp != subjobp2 ) 
    utils_debug (GLOBUS_DUROC_DEBUG_FLAG,
		 "job monitor unlink subjobp %x != removed subjobp %x!!\n",
		 (int) (long) subjobp, (int) (long) subjobp2);
  assert (subjobp == subjobp2);

  err = nexus_mutex_unlock (&(job_monitorp->mutex)); assert (!err);

  /* the duplicated label served only as a search key and for the
   * debug output; release it so it is not leaked on every unlink */
  if ( subjob_label )
    globus_free (subjob_label);
}

/*
 * associate a GRAM contact string with a subjob serial number
 *
 * a private copy of the contact string is used as the key in the
 * job monitor's GRAM contact table.  The contact must not be present
 * in the table yet (asserted).  Always returns GLOBUS_DUROC_SUCCESS.
 */
int
globus_duroc_control_i_job_monitor_link_gram (globus_duroc_job_monitor_t * job_monitorp,
				       const char * contact,
				       int serialno)
{
  int rc;
  char *key;
  void *existing;

  /* the table keeps this copy as its key */
  key = utils_strdup (contact);

  rc = nexus_mutex_lock (&(job_monitorp->mutex));
  assert (!rc);

  existing = globus_hashtable_lookup (&(job_monitorp->subjob_globus_gram_hasht),
				      (void *) key);
  assert( existing == 0 );

  rc = globus_hashtable_insert (&(job_monitorp->subjob_globus_gram_hasht),
				(void *) key,
				(void *) (long) serialno);
  assert (!rc);

  utils_debug (GLOBUS_DUROC_DEBUG_FLAG,
	       "\ninserted (>>%s<<, %x) into subjob_globus_gram_hasht\n",
	       key, serialno);

  rc = nexus_mutex_unlock (&(job_monitorp->mutex));
  assert (!rc);

  return GLOBUS_DUROC_SUCCESS;
}

/*
 * drop the GRAM contact -> serial number association
 *
 * the entry is removed from the GRAM contact table under the job
 * monitor mutex; the removed serial number is asserted to be positive.
 */
void
globus_duroc_control_i_job_monitor_unlink_gram (globus_duroc_job_monitor_t *job_monitorp,
					 const char * contact)
{
  int rc;
  int removed_serialno;
  void *removed;

  rc = nexus_mutex_lock (&(job_monitorp->mutex));
  assert (!rc);

  removed = globus_hashtable_remove (&(job_monitorp->subjob_globus_gram_hasht),
				     (void *) contact);
  removed_serialno = (int) (long) removed;
  assert (removed_serialno>0);

  rc = nexus_mutex_unlock (&(job_monitorp->mutex));
  assert (!rc);
}

/*
 * reverse the init operation
 * unbind from the control
 *
 * does nothing when either argument is NULL.  Otherwise the record is
 * unlinked from the control, its ref_count is asserted to be zero and
 * its subjobs list is asserted to be empty, then every resource the
 * init routine created (three hashtables, duct control, condition
 * variable, mutex) is destroyed and the record itself is handed to
 * globus_free.  The pointer must not be used afterwards.
 */
void
globus_duroc_control_i_job_monitor_destroy (globus_duroc_control_t     * controlp,
				     globus_duroc_job_monitor_t * job_monitorp)
{
  int err;

  if ( (controlp==NULL) 
       || (job_monitorp==NULL) ) return;

  globus_duroc_control_i_control_unlink_job (controlp, job_monitorp);

  err = nexus_mutex_lock (&(job_monitorp->mutex)); assert (!err);

  assert (job_monitorp->ref_count == 0);

  err = nexus_mutex_unlock (&(job_monitorp->mutex)); assert (!err);

  assert (globus_list_empty(job_monitorp->subjobs));

  /* tear down all three tables in the same order the init routine
   * uses when unwinding; the GRAM contact table was previously
   * left behind and leaked */
  globus_hashtable_destroy (&(job_monitorp->subjob_globus_gram_hasht));
  globus_hashtable_destroy (&(job_monitorp->subjob_label_hasht));
  globus_hashtable_destroy (&(job_monitorp->subjob_serialno_hasht));

  globus_duct_control_destroy (&(job_monitorp->duct_control));

  nexus_cond_destroy (&(job_monitorp->cond));

  nexus_mutex_destroy (&(job_monitorp->mutex));

  /* jobs are reference-count garbage-collected */
  globus_free (job_monitorp);

  return;
}

#define s_subjob_by_label_lock_err(err) \
(globus_duroc_at_error ("mutex lock", err), GLOBUS_DUROC_ERROR_NEXUS_FAILED)

/*
 * find the subjob record registered under a label
 *
 * the label is translated to a subjob serial number through the
 * job monitor's label table; a known label is then resolved with
 * globus_duroc_control_i_subjob_lookup_by_serialno, whose result is
 * returned unchanged.
 *
 * returns GLOBUS_DUROC_ERROR_INVALID_PARAMETER for a NULL argument,
 * GLOBUS_DUROC_ERROR_NEXUS_FAILED when the monitor mutex cannot be
 * locked, and GLOBUS_DUROC_ERROR_UNKNOWN_LABEL (with *subjobpp set
 * to NULL) when the label is not in the table.
 */
int 
globus_duroc_control_i_subjob_lookup_by_label (globus_duroc_job_monitor_t *job_monitorp,
					const char          *label,
					globus_duroc_subjob_t     **subjobpp)
{
  int rc;
  int found_serialno;

  if ( job_monitorp == NULL )
    return GLOBUS_DUROC_ERROR_INVALID_PARAMETER;
  if ( label == NULL )
    return GLOBUS_DUROC_ERROR_INVALID_PARAMETER;
  if ( subjobpp == NULL )
    return GLOBUS_DUROC_ERROR_INVALID_PARAMETER;

  rc = nexus_mutex_lock (&(job_monitorp->mutex));
  if ( rc )
    return s_subjob_by_label_lock_err (rc);

  found_serialno = (int) (long)
    globus_hashtable_lookup (&(job_monitorp->subjob_label_hasht),
			     (void *) label);

  rc = nexus_mutex_unlock (&(job_monitorp->mutex));
  assert (!rc);

  if ( found_serialno > 0 )
    return globus_duroc_control_i_subjob_lookup_by_serialno (job_monitorp,
							     found_serialno,
							     subjobpp);

  /* label not registered with this job */
  (*subjobpp) = NULL;
  return GLOBUS_DUROC_ERROR_UNKNOWN_LABEL;
}

#define s_subjob_by_globus_gram_lock_err(err) \
(globus_duroc_at_error ("mutex lock", err), GLOBUS_DUROC_ERROR_NEXUS_FAILED)

/*
 * find the subjob record registered under a GRAM contact string
 *
 * the contact is translated to a subjob serial number through the
 * job monitor's GRAM contact table; a known contact is then resolved
 * with globus_duroc_control_i_subjob_lookup_by_serialno, whose result
 * is returned unchanged.
 *
 * returns GLOBUS_DUROC_ERROR_INVALID_PARAMETER for a NULL argument and
 * GLOBUS_DUROC_ERROR_NEXUS_FAILED when the monitor mutex cannot be
 * locked.  An unknown contact is NOT an error here: *subjobpp is set
 * to NULL and GLOBUS_DUROC_SUCCESS is returned.
 */
int 
globus_duroc_control_i_subjob_lookup_by_gram (globus_duroc_job_monitor_t *job_monitorp,
				       const char          *globus_gram_contact,
				       globus_duroc_subjob_t     **subjobpp)
{
  int rc;
  int found_serialno;

  if ( job_monitorp == NULL )
    return GLOBUS_DUROC_ERROR_INVALID_PARAMETER;
  if ( globus_gram_contact == NULL )
    return GLOBUS_DUROC_ERROR_INVALID_PARAMETER;
  if ( subjobpp == NULL )
    return GLOBUS_DUROC_ERROR_INVALID_PARAMETER;

  rc = nexus_mutex_lock (&(job_monitorp->mutex));
  if ( rc )
    return s_subjob_by_globus_gram_lock_err (rc);

  found_serialno = (int) (long)
    globus_hashtable_lookup (&(job_monitorp->subjob_globus_gram_hasht),
			     (void *) globus_gram_contact);

  rc = nexus_mutex_unlock (&(job_monitorp->mutex));
  assert (!rc);

  if ( found_serialno > 0 )
    return globus_duroc_control_i_subjob_lookup_by_serialno (job_monitorp,
							     found_serialno,
							     subjobpp);

  /* contact not registered with this job: report "nothing found" */
  (*subjobpp) = NULL;
  return GLOBUS_DUROC_SUCCESS;
}

#define s_subjob_by_serialno_lock_err(err) \
(globus_duroc_at_error ("mutex lock", err), GLOBUS_DUROC_ERROR_NEXUS_FAILED)
#define s_subjob_by_serialno_lock2_err(err) \
(globus_duroc_at_error ("mutex lock", err), GLOBUS_DUROC_ERROR_NEXUS_FAILED)

/*
 * get subjob_t record for a subjob serial number
 *
 * the record is looked up in the job monitor's serial-number table
 * under the monitor mutex.  When a record is found its ref_count is
 * incremented under the subjob's own mutex, so the caller must pass
 * the record to s_subjob_release after use.
 *
 * returns GLOBUS_DUROC_ERROR_INVALID_PARAMETER for a NULL pointer
 * argument or a non-positive serial number,
 * GLOBUS_DUROC_ERROR_NEXUS_FAILED when one of the two mutexes cannot
 * be locked, and GLOBUS_DUROC_SUCCESS otherwise.  On success
 * *subjobpp is NULL when no subjob has that serial number.
 *
 * NOTE(review): the monitor mutex is released before the subjob mutex
 * is taken, so lookup and reference increment are two separate
 * critical sections -- confirm that a subjob cannot be destroyed in
 * between.
 */
int
globus_duroc_control_i_subjob_lookup_by_serialno (globus_duroc_job_monitor_t *job_monitorp,
					   int                  serialno,
					   globus_duroc_subjob_t     **subjobpp)
{
  int err;

  if ( (job_monitorp==NULL) || (serialno<=0) || (subjobpp==NULL) ) 
    return GLOBUS_DUROC_ERROR_INVALID_PARAMETER;

  err = nexus_mutex_lock (&(job_monitorp->mutex));
  if (err) {
    return s_subjob_by_serialno_lock_err (err);
  }

  (*subjobpp) = ((globus_duroc_subjob_t *) 
		 globus_hashtable_lookup (&(job_monitorp->
					    subjob_serialno_hasht),
					  (void *) (long) serialno));

  err = nexus_mutex_unlock (&(job_monitorp->mutex)); assert (!err);

  if ( (*subjobpp) != NULL ) {
    err = nexus_mutex_lock (&((*subjobpp)->mutex));
    if (err) {
      /* the monitor mutex was already released above, so it must not
       * be unlocked a second time here; no reference was taken
       * either, so do not hand the record out */
      (*subjobpp) = NULL;
      return s_subjob_by_serialno_lock2_err (err);
    }

    (*subjobpp)->ref_count += 1;

    err = nexus_mutex_unlock (&((*subjobpp)->mutex)); assert (!err);
  }

  return GLOBUS_DUROC_SUCCESS;
}

/*
 * counterpart of the subjob lookup routines and of the copied subjob
 * list: callers invoke it when they are done with a subjob record.
 *
 * as written this is a no-op: each argument is only handed to
 * GLOBUS_IGNORE (presumably a macro that silences unused-parameter
 * warnings -- TODO confirm) and no ref_count is modified here.
 * NOTE(review): the lookups increment subjob ref_count but nothing in
 * this routine decrements it; confirm whether that is intentional.
 */
void
globus_duroc_control_i_subjob_release (globus_duroc_control_t *controlp,
				globus_duroc_job_monitor_t *job_monitorp,
				globus_duroc_subjob_t **subjobpp)
{
  GLOBUS_IGNORE controlp;
  GLOBUS_IGNORE job_monitorp;
  GLOBUS_IGNORE subjobpp;
}

/*
 * take a snapshot of the job's subjob list
 *
 * the caller must already hold the job monitor mutex.  The list
 * cells are copied with globus_list_copy and the ref_count of every
 * subjob in the copy is incremented under that subjob's mutex.
 * The copied list is returned.
 */
static globus_list_t *
globus_duroc_control_i_job_monitor_copy_subjobs (globus_duroc_job_monitor_t * job_monitorp)
{
  globus_list_t * snapshot;
  globus_list_t * cursor;

  /* job_monitorp already locked */
  snapshot = globus_list_copy (job_monitorp->subjobs);

  for ( cursor = snapshot;
	! globus_list_empty (cursor);
	cursor = globus_list_rest (cursor) ) {
    globus_duroc_subjob_t * entry;
    int rc;

    entry = (globus_duroc_subjob_t *) globus_list_first (cursor);
    assert (entry!=NULL);

    /* pin the record for whoever walks the snapshot */
    rc = nexus_mutex_lock (&(entry->mutex));
    assert (!rc);
    entry->ref_count += 1;
    rc = nexus_mutex_unlock (&(entry->mutex));
    assert (!rc);
  }

  return snapshot;
}

/* report a failed RUN send; the resulting value is not used */
#define s_poll_run_err(err) \
(globus_duroc_at_error ("nexus_send_rsr (RUN command)", err), \
 0 /* error value ignored */)

/*
 * evaluate the barrier and termination status of one job
 *
 * nothing but a debug message is produced until release_barrier has
 * been set on the job monitor.  Once it is set and the barrier has
 * not been released yet:
 *
 *   pass 1 (monitor mutex held) inspects every subjob and decides
 *     must_fail - a STRICT subjob is FAILED or DONE, or a subjob has
 *                 not checked in and wait_for_checkins is not set
 *     must_wait - a subjob has not checked in and wait_for_checkins
 *                 is set
 *   (subjobs with start type NONE never cause waiting)
 *
 *   pass 2 then does one of
 *     proceed - mark barrier_released, and send RUN_MSG_ID to the
 *               command startpoint of every subjob whose start type
 *               is not NONE, setting its state to RELEASED
 *     wait    - nothing
 *     fail    - cancel the whole job through
 *               globus_duroc_control_job_cancel
 *
 * every branch leaves the monitor mutex unlocked; it is then taken
 * again and, if the barrier has been released, a summary pass logs
 * for each subjob whether it has terminated.
 */
void 
globus_duroc_control_i_job_monitor_poll (globus_duroc_control_t * controlp,
				  globus_duroc_job_monitor_t * job_monitorp)
{
  int err;

  utils_debug (GLOBUS_DUROC_DEBUG_FLAG,
	       "\n"
	       "job poll beginning...\n");

  err = nexus_mutex_lock (&(job_monitorp->mutex)); assert (!err);

  if ( job_monitorp->release_barrier == GLOBUS_TRUE ) {
    if ( job_monitorp->barrier_released == GLOBUS_FALSE ) {
      globus_bool_t   must_wait = GLOBUS_FALSE;
      globus_bool_t   must_fail = GLOBUS_FALSE;
      globus_list_t * subjob_iter;
      
      /* make initial run/fail/wait decision pass over subjobs */
      {
	subjob_iter = job_monitorp->subjobs;

	while ( ! globus_list_empty (subjob_iter) ) {
	  globus_duroc_subjob_t * subjobp;
	  int              subjob_state;
	  globus_bool_t    subjob_checked_in;
	  globus_duroc_start_method_t subjob_start_type;
	  char           * subjob_label;

	  subjobp = ((globus_duroc_subjob_t *) globus_list_first (subjob_iter));

	  /* copy what is needed out of the subjob under its own lock */
	  err = nexus_mutex_lock (&(subjobp->mutex)); assert (!err);

	  subjob_state = subjobp->state;
	  subjob_checked_in = subjobp->checked_in;
	  subjob_label = utils_strdup (subjobp->label);
	  subjob_start_type = subjobp->start_type;

	  err = nexus_mutex_unlock (&(subjobp->mutex)); assert (!err);

	  if ( ((subjob_state == GLOBUS_DUROC_SUBJOB_STATE_FAILED)
		|| (subjob_state == GLOBUS_DUROC_SUBJOB_STATE_DONE))
	       && (subjob_start_type == GLOBUS_DUROC_START_STRICT) ) {

	    utils_debug (GLOBUS_DUROC_DEBUG_FLAG,
			 "      subjob >>%s<< in strict mode terminated "
			 "before barrier!\n",
			 (subjob_label ? subjob_label : "(none)"));

	    must_fail = GLOBUS_TRUE;
	  }
	  else if ( (subjob_checked_in == GLOBUS_FALSE)
		    && (subjob_start_type != GLOBUS_DUROC_START_NONE) ) {
	    utils_debug (GLOBUS_DUROC_DEBUG_FLAG,
			 ((subjob_state==GLOBUS_DUROC_SUBJOB_STATE_PENDING)
			  ? ("      subjob >>%s<< not checked in "
			     "and not active\n")
			  : ("      subjob >>%s<< not checkin in "
			     "but active\n")),
			 (subjob_label ? subjob_label : "(none)"));

	    /* a missing check-in either delays or fails the barrier,
	     * depending on how the release was requested */
	    if ( job_monitorp->wait_for_checkins
		 == GLOBUS_TRUE ) {
	      must_wait = GLOBUS_TRUE;
	    }
	    else {
	      must_fail = GLOBUS_TRUE;
	    }
	  }
	  else /* job ready for barrier release */ {

	    if ( subjob_start_type != GLOBUS_DUROC_START_NONE ) {
	      utils_debug (GLOBUS_DUROC_DEBUG_FLAG,
			   "      subjob >>%s<< checked in %s\n",
			   (subjob_label ? subjob_label : "(none)"),
			   ((subjob_state == GLOBUS_DUROC_SUBJOB_STATE_FAILED)
			    ? "(fail state ignored)"
			    : ((subjob_state == GLOBUS_DUROC_SUBJOB_STATE_DONE)
			       ? "(done state ignored)"
			       : "and active")));
	    }

	    /* do nothing more in this pass */
	  }

	  subjob_iter = globus_list_rest (subjob_iter);
	  globus_free (subjob_label);
	}

      }

      utils_debug (GLOBUS_DUROC_DEBUG_FLAG,
		   "   job poll initial pass result: %s.\n",
		   ((must_fail==GLOBUS_TRUE) 
		    ? "must FAIL"
		    : ((must_wait==GLOBUS_TRUE)
		       ? "must WAIT"
		       : "may PROCEED")));
      
      /* the second (release) pass */
      if ( must_fail == GLOBUS_FALSE ) {
	if ( must_wait == GLOBUS_FALSE ) {
	  /* release the barrier */
	  globus_list_t * subjobs;

	  /* work on a referenced snapshot so the monitor mutex can be
	   * dropped while the RUN messages are sent */
	  subjobs = globus_duroc_control_i_job_monitor_copy_subjobs (job_monitorp);
	  
	  job_monitorp->barrier_released = GLOBUS_TRUE;

	  err = nexus_mutex_unlock (&(job_monitorp->mutex)); assert (!err);

	  while ( ! globus_list_empty (subjobs) ) {
	    nexus_buffer_t  send_buffer;
	    nexus_startpoint_t command_sp;
	    globus_duroc_subjob_t * subjobp;

	    subjobp = ((globus_duroc_subjob_t *)
		       globus_list_first (subjobs));

	    err = nexus_mutex_lock (&(subjobp->mutex)); assert (!err);

	    if ( subjobp->start_type != GLOBUS_DUROC_START_NONE ) {

	      utils_debug (GLOBUS_DUROC_DEBUG_FLAG,
			   "releasing subjob >>%s<< barrier\n",
			   (subjobp->label ? subjobp->label : "(none)"));

	      /* overwrite failed/done states since we must be
	       * in LOOSE mode for them to occur here */
	      subjobp->state = GLOBUS_DUROC_SUBJOB_STATE_RELEASED;

	      /* send through a copy so the subjob lock is not held
	       * during the send */
	      err = nexus_startpoint_copy (&command_sp,
					   &(subjobp->command_sp));
	      assert (!err);

	      err = nexus_mutex_unlock (&(subjobp->mutex)); assert (!err);

	      err = nexus_buffer_init (&send_buffer, 0, 0); assert (!err);

	      err = nexus_send_rsr (&send_buffer, &command_sp,
				    RUN_MSG_ID,
				    NEXUS_TRUE /* destroy buffer */,
				    NEXUS_TRUE /* always safe */);
	      if (err) {
		/* reported only; the remaining subjobs are still released */
		s_poll_run_err (err);
	      }
	    }
	    else {
	      err = nexus_mutex_unlock (&(subjobp->mutex)); assert (!err);
	    }

	    globus_duroc_control_i_subjob_release (controlp, job_monitorp, &subjobp);
	    /* consume the snapshot cell just processed */
	    globus_list_remove (&(subjobs), subjobs);
	  }
	}
	else /* must_wait == GLOBUS_TRUE */ {
	  /* do nothing in this poll op */
	  err = nexus_mutex_unlock (&(job_monitorp->mutex)); assert (!err);
	}
      }
      else /* must_fail == GLOBUS_TRUE */ {
	/* fail out the job */
	char * contact;

	err = nexus_mutex_unlock (&(job_monitorp->mutex)); assert (!err);

	/* NOTE(review): contact is not freed in this routine;
	 * confirm who owns the string */
	err = globus_duroc_control_job_contact (controlp, job_monitorp, &contact);
	assert (!err);

	utils_debug (GLOBUS_DUROC_DEBUG_FLAG,
		     "barrier failed, canceling job.\n");

	globus_duroc_control_job_cancel (controlp, contact);
      }
    }
    else {
      err = nexus_mutex_unlock (&(job_monitorp->mutex)); assert (!err);
    }

    /* the monitor mutex is free on every path reaching this point */
    err = nexus_mutex_lock (&(job_monitorp->mutex)); assert (!err);

    if ( job_monitorp->barrier_released == GLOBUS_TRUE ) {
      globus_list_t * subjob_iter;

      utils_debug (GLOBUS_DUROC_DEBUG_FLAG,
		   "subjob barriers have been released.\n");
      
      /* make job state summary pass */
      {
	globus_bool_t    subjobs_running;

	subjobs_running = GLOBUS_FALSE;

	subjob_iter = job_monitorp->subjobs;

	while ( ! globus_list_empty (subjob_iter) ) {
	  globus_duroc_subjob_t * subjobp;
	  int              subjob_state;
	  char           * subjob_label;

	  subjobp = ((globus_duroc_subjob_t *) globus_list_first (subjob_iter));

	  err = nexus_mutex_lock (&(subjobp->mutex)); assert (!err);

	  subjob_state = subjobp->state;
	  subjob_label = utils_strdup (subjobp->label);

	  err = nexus_mutex_unlock (&(subjobp->mutex)); assert (!err);

	  if ( (subjob_state == GLOBUS_DUROC_SUBJOB_STATE_FAILED)
	       || (subjob_state == GLOBUS_DUROC_SUBJOB_STATE_DONE) ) {

	    utils_debug (GLOBUS_DUROC_DEBUG_FLAG,
			 "      subjob >>%s<< terminated (%s)\n",
			 (subjob_label ? subjob_label : "(none)"),
			 ((subjob_state == GLOBUS_DUROC_SUBJOB_STATE_DONE)
			  ? "done"
			  : "failed"));
	  }
	  else /* job released and not terminated */ {
	    utils_debug (GLOBUS_DUROC_DEBUG_FLAG,
			 "      subjob >>%s<< not terminated\n",
			 (subjob_label ? subjob_label : "(none)"));

	    subjobs_running = GLOBUS_TRUE;
	  }

	  subjob_iter = globus_list_rest (subjob_iter);
	  globus_free (subjob_label);
	}

	/* the outcome is only logged; no state is changed here */
	if ( subjobs_running == GLOBUS_FALSE ) 
	  utils_debug (GLOBUS_DUROC_DEBUG_FLAG,
		       "this job has completely terminated.\n");
      }
    }

    err = nexus_mutex_unlock (&(job_monitorp->mutex)); assert (!err);
  }
  else {
    utils_debug (GLOBUS_DUROC_DEBUG_FLAG,
		 "job barrier not yet released by user.\n");

    err = nexus_mutex_unlock (&(job_monitorp->mutex)); assert (!err);
  }

  utils_debug (GLOBUS_DUROC_DEBUG_FLAG,
	       "job poll ending.\n\n");

}

#define s_barrier_release_repeated_err(err) \
(globus_duroc_at_error ("repeat barrier release", err), \
 GLOBUS_DUROC_ERROR_ALREADY_RELEASED)
#define s_barrier_release_canceled_err(err) \
(globus_duroc_at_error ("release canceled job", err), \
 GLOBUS_DUROC_ERROR_ALREADY_CANCELLED)
#define s_barrier_release_groupsize_err(err) \
(globus_duroc_at_error ("globus_duct_control_set_groupsize", err), \
 0 /* error value ignored */)

/*
 * request release of the job's startup barrier
 *
 * marks the job monitor as released by the user, records whether
 * missing check-ins should be waited for, tells the duct control how
 * many subjobs take part in communication (those whose comms type is
 * not NONE) and then runs one poll so the barrier is released at once
 * if the subjobs are ready.
 *
 * returns GLOBUS_DUROC_ERROR_ALREADY_RELEASED when the barrier was
 * already requested, GLOBUS_DUROC_ERROR_ALREADY_CANCELLED when the
 * job has been canceled, GLOBUS_DUROC_SUCCESS otherwise.
 */
int
globus_duroc_control_i_job_monitor_barrier_release (
			     globus_duroc_control_t     * controlp,
			     globus_duroc_job_monitor_t * job_monitorp,
			     globus_bool_t         wait_for_checkins)
{
  int err;
  int err2;
  int duct_count = 0;
  globus_duct_control_t * duct_controlp;
  globus_list_t * subjob_iter;

  err = nexus_mutex_lock (&(job_monitorp->mutex)); assert (!err);
  
  if ( job_monitorp->release_barrier == GLOBUS_TRUE ) {
    err = s_barrier_release_repeated_err(0);
    goto barrier_release_repeated_error;
  }

  if ( job_monitorp->job_canceled == GLOBUS_TRUE ) {
    err = s_barrier_release_canceled_err(0);
    goto barrier_release_canceled_error;
  }

  job_monitorp->release_barrier = GLOBUS_TRUE;
  job_monitorp->wait_for_checkins = wait_for_checkins;

  duct_controlp = &(job_monitorp->duct_control);

  /* count the subjobs that take part in the duct */
  subjob_iter = job_monitorp->subjobs;
  while (! globus_list_empty (subjob_iter) ) {
    globus_duroc_subjob_t * subjobp;
    subjobp = ((globus_duroc_subjob_t *)
	       globus_list_first (subjob_iter));
    err = nexus_mutex_lock (&(subjobp->mutex)); assert (!err);
    if ( subjobp->comms_type != GLOBUS_DUROC_COMMS_NONE )
      duct_count += 1;
    err = nexus_mutex_unlock (&(subjobp->mutex)); assert (!err);
    subjob_iter = globus_list_rest (subjob_iter);
  }

  err = nexus_mutex_unlock (&(job_monitorp->mutex)); assert (!err);

  err = globus_duct_control_set_groupsize (duct_controlp, duct_count);
  if (err) {
    /* the barrier request is already recorded, so the failure is
     * reported rather than silently dropped and the poll still runs */
    s_barrier_release_groupsize_err (err);
  }

  globus_duroc_control_i_job_monitor_poll (controlp, job_monitorp);

  return GLOBUS_DUROC_SUCCESS;

 barrier_release_canceled_error:

 barrier_release_repeated_error:
  err2 = nexus_mutex_unlock (&(job_monitorp->mutex)); assert (!err2);

  return err;
}

#define s_cancel_already_canceled_err(err) \
(globus_duroc_at_error ("job already canceled", 0), \
 GLOBUS_DUROC_ERROR_ALREADY_CANCELLED)

/*
 * cancel a job by killing every one of its subjobs
 *
 * under the job monitor mutex the job is marked canceled and a
 * referenced snapshot of the subjob list is taken; the subjobs in the
 * snapshot are then killed one by one with the monitor mutex released.
 *
 * returns GLOBUS_DUROC_ERROR_ALREADY_CANCELLED when the job was
 * canceled before, GLOBUS_DUROC_SUCCESS otherwise.
 */
int
globus_duroc_control_i_job_cancel (globus_duroc_control_t * controlp,
			    globus_duroc_job_monitor_t * job_monitorp)
{
  int err;
  int err2;
  globus_list_t *subjobs;

  err = nexus_mutex_lock (&(job_monitorp->mutex)); assert (!err);

  if ( job_monitorp->job_canceled == GLOBUS_TRUE ) {
    err = s_cancel_already_canceled_err(0);
    goto cancel_already_canceled_error;
  }

  /* single assignment: the former "subjobs = subjobs = ..." stored to
   * the same object twice without an intervening sequence point */
  subjobs = globus_duroc_control_i_job_monitor_copy_subjobs (job_monitorp);
 
  job_monitorp->job_canceled = GLOBUS_TRUE;

  err = nexus_mutex_unlock (&(job_monitorp->mutex)); assert (!err);

  while ( ! globus_list_empty (subjobs) ) {
    globus_duroc_subjob_t * subjobp;

    subjobp = ((globus_duroc_subjob_t *)
	       globus_list_first (subjobs));

    /* the label is read under the subjob lock for the debug output */
    err = nexus_mutex_lock (&(subjobp->mutex)); assert (!err);

    utils_debug (GLOBUS_DUROC_DEBUG_FLAG,
		 "\ncanceling subjob >>%s<<\n",
		 (subjobp->label
		  ? subjobp->label
		  : "(unlabeled)"));

    err = nexus_mutex_unlock (&(subjobp->mutex)); assert (!err);

    globus_duroc_control_i_subjob_kill (subjobp);

    globus_duroc_control_i_subjob_release (controlp, job_monitorp, &subjobp);
    /* consume the snapshot cell just processed */
    globus_list_remove (&(subjobs), subjobs);
  }


  return GLOBUS_DUROC_SUCCESS;

 cancel_already_canceled_error:

  err2 = nexus_mutex_unlock (&(job_monitorp->mutex)); assert (!err2);

  return err;
}

/*
 * report the state and label of every subjob of a job
 *
 * on success *subjob_countp holds the number of subjobs and, when it
 * is greater than zero, *subjob_statesp and *subjob_labelsp point to
 * newly globus_malloc'd arrays of that length holding each subjob's
 * state and a utils_strdup copy of its label.  With no subjobs both
 * array pointers are set to GLOBUS_NULL.
 *
 * returns GLOBUS_DUROC_ERROR_INVALID_PARAMETER when one of the three
 * output pointers is NULL, GLOBUS_DUROC_SUCCESS otherwise.
 * controlp is not used.
 */
int
globus_duroc_control_i_subjob_states (globus_duroc_control_t * controlp,
			       globus_duroc_job_monitor_t * job_monitorp,
			       int    * subjob_countp,
			       int   ** subjob_statesp,
			       char *** subjob_labelsp)
{
  int rc;
  int idx;
  globus_list_t *cursor;

  GLOBUS_IGNORE controlp;

  if ( subjob_countp == NULL )
    return GLOBUS_DUROC_ERROR_INVALID_PARAMETER;
  if ( subjob_statesp == NULL )
    return GLOBUS_DUROC_ERROR_INVALID_PARAMETER;
  if ( subjob_labelsp == NULL )
    return GLOBUS_DUROC_ERROR_INVALID_PARAMETER;

  rc = nexus_mutex_lock (&(job_monitorp->mutex));
  assert (!rc);

  (*subjob_countp) = globus_list_size (job_monitorp->subjobs);

  if ( (*subjob_countp) <= 0 ) {
    /* nothing to report: hand back empty results */
    (*subjob_statesp) = GLOBUS_NULL;
    (*subjob_labelsp) = GLOBUS_NULL;
  }
  else {
    (*subjob_statesp) = globus_malloc (sizeof (int) * (*subjob_countp));
    assert ((*subjob_statesp)!=NULL);

    (*subjob_labelsp) = globus_malloc (sizeof (char *) * (*subjob_countp));
    assert ((*subjob_labelsp)!=NULL);
  }

  /* fill the arrays in list order, reading each subjob under its lock */
  cursor = job_monitorp->subjobs;
  idx = 0;
  while ( idx < (*subjob_countp) ) {
    globus_duroc_subjob_t * entry;

    assert (cursor != NULL);
    entry = (globus_duroc_subjob_t *) globus_list_first (cursor);
    assert (entry!=NULL);

    rc = nexus_mutex_lock (&(entry->mutex));
    assert (!rc);

    (*subjob_statesp)[idx] = entry->state;
    (*subjob_labelsp)[idx] = utils_strdup (entry->label);

    rc = nexus_mutex_unlock (&(entry->mutex));
    assert (!rc);

    cursor = globus_list_rest (cursor);
    idx++;
  }

  rc = nexus_mutex_unlock (&(job_monitorp->mutex));
  assert (!rc);

  return GLOBUS_DUROC_SUCCESS;
}

/*
 * report the contact string and label of every subjob of a job
 *
 * on success *subjob_countp holds the number of subjobs and, when it
 * is greater than zero, *subjob_contactsp and *subjob_labelsp point
 * to newly globus_malloc'd arrays of that length holding utils_strdup
 * copies of each subjob's contact and label.  With no subjobs both
 * array pointers are set to GLOBUS_NULL.
 *
 * returns GLOBUS_DUROC_ERROR_INVALID_PARAMETER when one of the three
 * output pointers is NULL, GLOBUS_DUROC_SUCCESS otherwise.
 * controlp is not used.
 */
int
globus_duroc_control_i_subjob_contacts (globus_duroc_control_t * controlp,
			       globus_duroc_job_monitor_t * job_monitorp,
			       int    * subjob_countp,
			       char *** subjob_contactsp,
			       char *** subjob_labelsp)
{
  int rc;
  int idx;
  globus_list_t *cursor;

  GLOBUS_IGNORE controlp;

  if ( subjob_countp == NULL )
    return GLOBUS_DUROC_ERROR_INVALID_PARAMETER;
  if ( subjob_contactsp == NULL )
    return GLOBUS_DUROC_ERROR_INVALID_PARAMETER;
  if ( subjob_labelsp == NULL )
    return GLOBUS_DUROC_ERROR_INVALID_PARAMETER;

  rc = nexus_mutex_lock (&(job_monitorp->mutex));
  assert (!rc);

  (*subjob_countp) = globus_list_size (job_monitorp->subjobs);

  if ( (*subjob_countp) <= 0 ) {
    /* nothing to report: hand back empty results */
    (*subjob_contactsp) = GLOBUS_NULL;
    (*subjob_labelsp) = GLOBUS_NULL;
  }
  else {
    (*subjob_contactsp) = globus_malloc (sizeof (char *) * (*subjob_countp));
    assert ((*subjob_contactsp)!=NULL);

    (*subjob_labelsp) = globus_malloc (sizeof (char *) * (*subjob_countp));
    assert ((*subjob_labelsp)!=NULL);
  }

  /* fill the arrays in list order, reading each subjob under its lock */
  cursor = job_monitorp->subjobs;
  idx = 0;
  while ( idx < (*subjob_countp) ) {
    globus_duroc_subjob_t * entry;

    assert (cursor != NULL);
    entry = (globus_duroc_subjob_t *) globus_list_first (cursor);
    assert (entry!=NULL);

    rc = nexus_mutex_lock (&(entry->mutex));
    assert (!rc);

    (*subjob_contactsp)[idx] = utils_strdup (entry->contact);
    (*subjob_labelsp)[idx] = utils_strdup (entry->label);

    rc = nexus_mutex_unlock (&(entry->mutex));
    assert (!rc);

    cursor = globus_list_rest (cursor);
    idx++;
  }

  rc = nexus_mutex_unlock (&(job_monitorp->mutex));
  assert (!rc);

  return GLOBUS_DUROC_SUCCESS;
}
