/******************************************************************************
Description:

  Protocol module for message passing systems, including:
	- MPL on the IBM SP1
	- INX on the Intel Paragon

CVS Information:

  $Source: /home/globdev/CVS/globus-current/Globus/Communication/nexus/libraries/nexus/pr_mp.c,v $
  $Date: 1999/09/24 05:09:20 $
  $Revision: 1.82 $
  $State: Exp $
  $Author: bresnaha $
******************************************************************************/

/* RCS identification string for this file ($Header$ keyword expansion). */
static char *rcsid = "$Header: /home/globdev/CVS/globus-current/Globus/Communication/nexus/libraries/nexus/pr_mp.c,v 1.82 1999/09/24 05:09:20 bresnaha Exp $";

/******************************************************************************
			     Include header files
******************************************************************************/
#include "internal.h"

#include "pr_mp.h"

#if    defined(GLOBUS_USING_THIS_MP_PROTO)

/*
 * SGI's MPI_Cancel is broken in releases prior to v3.0: cancelling a
 * posted receive causes MPI_Finalize to hang.  Define BROKEN_MPI_CANCEL
 * (below) to complete any posted receive by sending a message to it
 * instead of cancelling it.
 */

/*
#define BROKEN_MPI_CANCEL
*/

#ifdef NEXUS_CRITICAL_PATH_TIMER
/*
 * Critical path timing state, compiled in only when
 * NEXUS_CRITICAL_PATH_TIMER is defined.  mp_start_send() stops the timer
 * identified by _nx_critical_path_stop_timer whenever
 * _nx_critical_path_timer_started is set.  These objects have external
 * linkage.
 */
#include "perf/utp/UTP.h"
globus_bool_t _nx_time_critical_path = NEXUS_FALSE;
globus_bool_t _nx_critical_path_timer_started = NEXUS_FALSE;
int _nx_critical_path_start_timer = 0;
int _nx_critical_path_stop_timer = 0;
#endif

/*
 * GLOBUS_L_NEXUS_MP_PROTOCOL_VERSION
 *
 * The version of this protocol module's wire protocol.  If you change
 * this module's wire protocol, bump this version number.  It is written
 * as the first byte of the big message header built in mp_start_send().
 */
#define GLOBUS_L_NEXUS_MP_PROTOCOL_VERSION (0 + GLOBUS_I_NEXUS_BUFFER_VERSION)


/*
 * GLOBUS_L_NEXUS_MP_MI_PROTO_VERSION
 *
 * The version of this protocol module's mi_proto.  If you change
 * the contents of this module's mi_proto, bump this version number.
 * It is the first byte of the array produced by mp_get_my_mi_proto()
 * and is checked by mp_construct_from_mi_proto().
 */
#define GLOBUS_L_NEXUS_MP_MI_PROTO_VERSION 0

/*
 * Hash table size (number of buckets) for the proto table
 */
#define PROTO_TABLE_SIZE 1021

/*
 * Thread is handler?
 *
 * Thread specific storage is used to keep track if the current
 * thread is a handler thread or not.
 *
 *   _nx_set_i_am_mp_handler_thread()   marks the calling thread as the
 *                                      handler thread.
 *   _nx_i_am_mp_handler_thread(Result) stores the calling thread's flag
 *                                      through the pointer Result.
 *
 * In a BUILD_LITE build no thread specific storage is used: setting is
 * a no-op and the query always stores NEXUS_TRUE.
 */
#ifdef BUILD_LITE
#define _nx_set_i_am_mp_handler_thread() /* nop */
#define _nx_i_am_mp_handler_thread(Result) *(Result) = NEXUS_TRUE
#else
static globus_thread_key_t i_am_mp_handler_thread_key;
#define _nx_set_i_am_mp_handler_thread() \
    nexus_thread_setspecific(i_am_mp_handler_thread_key, (void *) 1)
#define _nx_i_am_mp_handler_thread(Result) \
    *(Result) = (globus_bool_t)nexus_thread_getspecific(i_am_mp_handler_thread_key)
#endif /* BUILD_LITE */


/*
 * Only one thread is allowed to be in the mp code (and thus
 * mucking with data structures) at a time.
 *
 * mp_mutex is that module-wide lock.  The handler_thread_done_* trio is
 * used by mp_shutdown() to wait for mp_handler_thread() to finish.
 */
static nexus_mutex_t		mp_mutex;
static globus_bool_t		mp_done;
static globus_bool_t		handle_in_progress;
static globus_bool_t		using_handler_thread;
static globus_bool_t		handler_thread_done;
static nexus_mutex_t		handler_thread_done_mutex;
static nexus_cond_t		handler_thread_done_cond;

/*
 * Acquire / release mp_mutex.
 *
 * NOTE(review): both expansions already end in ';', so "mp_enter();"
 * yields an extra empty statement and the macros cannot be used as the
 * sole body of an unbraced if/else.
 */
#define mp_enter() nexus_mutex_lock(&mp_mutex);
#define mp_exit()  nexus_mutex_unlock(&mp_mutex);


/*
 * The mi_proto for this protocol module carries a unique session
 * string with it.  This allows one MPP to distinguish itself
 * from another MPP of the same type.
 *
 * Both values are set in mp_init().  session_string_length counts the
 * terminating NUL (strlen + 1).
 */
static char *	session_string;
static int	session_string_length;
    

/*
 * Other useful defines
 */
#define BLOCKING			NEXUS_TRUE
#define NON_BLOCKING			NEXUS_FALSE

/*
 * Message flag values placed just past the NEXUS_DC_FORMAT_* range.
 * BIG_MESSAGE_FLAG is written into the big message header by
 * mp_start_send(); CLOSE_HANDLER_FLAG is the one-byte wake-up message
 * mp_shutdown() sends to its own node.
 *
 * The expansions are parenthesized so that each flag behaves as a
 * single value inside any larger expression.
 */
#define BIG_MESSAGE_FLAG		(NEXUS_DC_FORMAT_LAST + 1)
#define CLOSE_HANDLER_FLAG		(NEXUS_DC_FORMAT_LAST + 3)


/*
 * Some forward typedef declarations...
 *
 * struct _mp_proto_t is defined further down in this file.
 */
typedef struct _mp_buffer_t	mp_buffer_t;
typedef struct _mp_proto_t	mp_proto_t;


/*
 * Some useful queue macros
 *
 * These operate on a singly linked FIFO whose elements have a 'next'
 * pointer member.  Qhead and Qtail must be modifiable pointer lvalues;
 * every argument may be evaluated more than once, so do not pass
 * expressions with side effects.
 */

/*
 * Enqueue(): append Item to the queue.
 *
 * The item's 'next' link is cleared first so the queue is always
 * NULL terminated, even if the item carries a stale link from
 * earlier use.  Dequeue() relies on that termination to detect the
 * end of the queue.
 */
#define Enqueue(Qhead, Qtail, Item) \
{ \
    (Item)->next = NULL; \
    if (Qhead) \
    { \
	(Qtail)->next = (Item); \
	(Qtail) = (Item); \
    } \
    else \
    { \
	(Qhead) = (Qtail) = (Item); \
    } \
}

/*
 * Dequeue(): remove the first element of a non-empty queue and store
 * it in Item.  Qtail is not touched; it is only meaningful while
 * Qhead is non-NULL.
 */
#define Dequeue(Qhead, Qtail, Item) \
{ \
    (Item) = (Qhead); \
    (Qhead) = (Qhead)->next; \
}

/* QueueNotEmpty(): true when the queue has at least one element. */
#define QueueNotEmpty(Qhead)	(Qhead)

/*
 * Size threshold in bytes: mp_start_send() sends a message directly when
 * it fits in this size and sends a big message header first otherwise.
 * Also used as the size of the posted receive buffer.  Set in mp_init().
 */
static unsigned long mp_default_storage_size;
#ifdef DONT_INCLUDE
/*
 * NOTE: everything down to the matching #endif is compiled only when
 * DONT_INCLUDE is defined.
 *
 * A default buffer storage free list, to avoid malloc calls on small
 * sends and receives.
 *
 * Access to the free list must be locked by mp_enter() and mp_exit().
 */
typedef struct _mp_storage_t
{
    struct _mp_storage_t *	next;
    char			storage[1];
} mp_storage_t;

static mp_storage_t *default_storage_free_list = (mp_storage_t *) NULL;

/*
 * Allocate a fresh mp_storage_t large enough to hold
 * mp_default_storage_size bytes in 'storage' and hand back a pointer to
 * that storage area (not to the enclosing struct).
 */
#define MallocMpDefaultStorage(Routine, Ptr) \
{ \
    mp_storage_t *__s; \
    NexusMalloc(Routine, __s, mp_storage_t *, \
		(sizeof(mp_storage_t)+mp_default_storage_size-1)); \
    (Ptr) = &(__s->storage[0]); \
}

/*
 * Take a storage area from the free list, or allocate a new one when
 * the free list is empty.
 */
#define GetMpDefaultStorage(Routine, Ptr) \
{ \
    if (default_storage_free_list) \
    { \
	Ptr = default_storage_free_list->storage; \
	default_storage_free_list = default_storage_free_list->next; \
    } \
    else \
    { \
	MallocMpDefaultStorage(Routine, Ptr); \
    } \
}

/*
 * Return a storage area to the free list.  The enclosing mp_storage_t
 * is recovered by stepping back over the leading 'next' pointer.
 */
#define FreeMpDefaultStorage(Ptr) \
{ \
    mp_storage_t *__s; \
    __s = (mp_storage_t *) (((char *) (Ptr)) - sizeof(mp_storage_t *)); \
    __s->next = default_storage_free_list; \
    default_storage_free_list = __s; \
}

/*
 * A mp_buffer_t free list, to avoid malloc calls on the
 * main body of a message buffer.
 *
 * Access to the free list must be locked by mp_enter() and mp_exit().
 */
static mp_buffer_t *	buffer_free_list = (mp_buffer_t *) NULL;

#endif /* DONT_INCLUDE */


/*
 * GLOBUS_MP_START_CRITICAL_PATH_TIMER()
 *
 * With NEXUS_CRITICAL_PATH_TIMER defined: start the critical path timer
 * and record that it is running, but only when timing is enabled and
 * 'handle_or_enqueue' equals HANDLE_MESSAGES.  Both of those names must
 * be visible at the point of use.  Without NEXUS_CRITICAL_PATH_TIMER the
 * macro expands to nothing.
 */
#ifdef NEXUS_CRITICAL_PATH_TIMER
#define GLOBUS_MP_START_CRITICAL_PATH_TIMER() \
{ \
    if (_nx_time_critical_path && handle_or_enqueue == HANDLE_MESSAGES) \
    { \
	_nx_critical_path_timer_started = NEXUS_TRUE; \
	UTP_start_timer(_nx_critical_path_start_timer); \
    } \
}
#else
#define GLOBUS_MP_START_CRITICAL_PATH_TIMER()
#endif

/*********************************************************************
 * 		Include protocol configuration
 *********************************************************************/

/*
 * this is a symbolic link to the appropriate protocol header file
 */

#include "pr_mp.h"


/*
 * This node's own destination and the node count, both filled in by
 * GLOBUS_MP_INIT_NODE_INFO() in mp_init().
 */
static globus_mp_destination_t		my_node;
static int				n_nodes;


/*
 * mp_proto_t
 *
 * This is an overload of nexus_proto_t.  It adds the
 * mp specific information to that structure.
 *
 * Pointers to this structure are cast to and from nexus_proto_t *
 * throughout this file, so the leading members presumably have to
 * mirror nexus_proto_t -- confirm against its definition before
 * reordering anything.
 */
struct _mp_proto_t
{
    nexus_proto_type_t		type;	/* NEXUS_PROTO_TYPE_MP */
    nexus_proto_funcs_t *	funcs;
    int				version;
    unsigned long		direct_custom_min_size;
    unsigned long		direct_custom_max_size;
    unsigned long		direct_pointer_min_size;
    unsigned long		direct_pointer_max_size;
    globus_bool_t		can_use_iovec;
    unsigned long		reserved_header_size;
    /* mp specific members start here */
    globus_mp_destination_t	destination;	/* node that sends go to */
    int				reference_count;
};


/*********************************************************************
 * Incoming message handling
 *
 * State for the single receive that MPPostReceive() posts and
 * MPReceiveCancel() tears down:
 *   receive_status       status handle of the posted receive
 *   receive_buffer       buffer the receive lands in (allocated lazily)
 *   receive_buffer_size  size of that buffer; set in mp_init()
 *   receive_posted       true while a receive is posted
 *********************************************************************/

static
globus_mp_receive_status_t	receive_status;
static nexus_byte_t *		receive_buffer = (nexus_byte_t *) NULL;
static unsigned long		receive_buffer_size;
static unsigned long		receive_pending_big_messages = 0;
static globus_bool_t		receive_posted = NEXUS_FALSE;
/* Communicator used for every send and receive in this module. */
static globus_mp_communicator_t	nexusl_pr_mp_communicator;

/*
 * MPPostReceive(Func, Error)
 *
 * Post a receive into receive_buffer unless one is already posted.
 * The buffer is allocated on first use with receive_buffer_size bytes.
 *
 *   Func   routine name passed through to NexusMalloc() and
 *          GLOBUS_MP_POST_RECEIVE().
 *   Error  lvalue that receives GLOBUS_SUCCESS or the error reported
 *          by GLOBUS_MP_POST_RECEIVE().
 *
 * receive_posted is set only when the post succeeded, so that a failed
 * post is not later treated as an outstanding receive.
 */
#define MPPostReceive(Func, Error)				\
{								\
    (Error) = GLOBUS_SUCCESS;					\
								\
    if (!receive_posted)					\
    {								\
        if (!receive_buffer)					\
        {							\
	    NexusMalloc(Func, receive_buffer, nexus_byte_t *,	\
			receive_buffer_size);			\
        }							\
	GLOBUS_MP_POST_RECEIVE(nexusl_pr_mp_communicator,	\
			       Func,				\
			       receive_buffer,			\
			       receive_buffer_size,		\
			       receive_status,			\
	                       (Error));			\
	if ((Error) == GLOBUS_SUCCESS)				\
	{							\
	    receive_posted = NEXUS_TRUE;			\
	}							\
    }								\
}

/*
 * MPReceiveCancel(Func, Error)
 *
 * Get rid of the receive posted by MPPostReceive() (if any) and free
 * receive_buffer.
 *
 *   Func   routine name passed through to the underlying macros.
 *   Error  lvalue that receives GLOBUS_SUCCESS or the first error
 *          reported by the message passing layer.
 *
 * With BROKEN_MPI_CANCEL defined the receive is not cancelled; instead
 * a dummy message is sent to this node and the posted receive is
 * allowed to complete.  Otherwise the receive is cancelled directly.
 *
 * In both variants receive_buffer is reset to NULL after it is freed so
 * that a later MPPostReceive() allocates a fresh buffer instead of
 * reusing a dangling pointer.
 */
#ifdef BROKEN_MPI_CANCEL
#define MPReceiveCancel(Func, Error)				\
{								\
    int done;							\
    int size;							\
    int dummy = 0;						\
    globus_mp_send_status_t send_status;			\
								\
    (Error) = GLOBUS_SUCCESS;					\
								\
    if (receive_posted)						\
    {								\
        GLOBUS_MP_SEND(nexusl_pr_mp_communicator,		\
		       my_node,					\
		       &dummy,					\
		       sizeof(int),				\
		       send_status,				\
		       (Error));				\
	if ((Error) == GLOBUS_SUCCESS)				\
	{							\
	    /* Same three-argument form used elsewhere in this file; */ \
	    /* stop polling as soon as an error is reported. */	\
	    do {						\
		GLOBUS_MP_SEND_STATUS(send_status, done, (Error)); \
	    } while ((Error) == GLOBUS_SUCCESS			\
		     && done == NEXUS_FALSE);			\
	}							\
	if ((Error) == GLOBUS_SUCCESS)				\
	{							\
	    GLOBUS_MP_RECEIVE_WAIT(Func,			\
				   receive_status,		\
				   &size,			\
				   done,			\
				   (Error));			\
	    receive_posted = NEXUS_FALSE;			\
	}							\
    }								\
    if (receive_buffer)						\
    {								\
	NexusFree(receive_buffer);				\
	receive_buffer = (nexus_byte_t *) NULL;			\
    }								\
}
#else  /* !BROKEN_MPI_CANCEL */
#define MPReceiveCancel(Func, Error)				\
{								\
    (Error) = GLOBUS_SUCCESS;					\
								\
    if (receive_posted)						\
    {								\
	GLOBUS_MP_RECEIVE_CANCEL(receive_status, (Error));	\
	if ((Error) == GLOBUS_SUCCESS)				\
	{							\
	    receive_posted = NEXUS_FALSE;			\
	}							\
    }								\
    if (receive_buffer)						\
    {								\
	NexusFree(receive_buffer);				\
	receive_buffer = (nexus_byte_t *) NULL;			\
    }								\
}
#endif /* BROKEN_MPI_CANCEL */

/*********************************************************************
 * Outgoing message handling
 *
 * At most one send is in flight at a time:
 *   send_buffer   buffer currently being sent, or NULL when idle
 *   send_status   status handle of the send posted for send_buffer
 *   send_q_head/  FIFO of buffers waiting for their turn (see the
 *   send_q_tail   Enqueue/Dequeue macros)
 *   send_big_message_header_buf
 *                 scratch space in which mp_start_send() builds the
 *                 big message header
 *********************************************************************/

static struct globus_nexus_buffer_s *	send_buffer;
static globus_mp_send_status_t		send_status;
static struct globus_nexus_buffer_s *	send_q_head;
static struct globus_nexus_buffer_s *	send_q_tail;
static nexus_byte_t		send_big_message_header_buf[64];


/*
 * Protocol table stuff.
 *
 * The protocol table is hashed on the destination. The table itself is an
 * array of header structures pointing to a linked list of buckets.
 *
 * This table is used to avoid creating multiple mp_proto_t
 * objects to the same context.  Multiple global pointers to the same
 * context share a mp_proto_t.
 */
typedef struct _proto_table_entry_t
{
    mp_proto_t *proto;
    struct _proto_table_entry_t *next;
} proto_table_entry_t;

/* NOTE(review): not declared static, so this symbol has external linkage. */
struct _proto_table_entry_t	proto_table[PROTO_TABLE_SIZE];

/* Handle of the periodic mp_poll() callback registered in mp_init(). */
static globus_callback_handle_t  globus_l_nexus_mp_callback_handle;

static void			proto_table_init(void);
static void			proto_table_insert(mp_proto_t *proto);
static mp_proto_t *
proto_table_lookup(
    globus_mp_destination_t *		dest);

/*
 * Various forward declarations of procedures
 *
 * The first group are the entry points installed in mp_proto_funcs;
 * the rest are helpers private to this file.  Every declaration is a
 * full prototype: functions without parameters are declared "(void)"
 * so that calls with stray arguments are diagnosed.
 */
static void		mp_init(globus_bool_t * add_to_my_mi_proto);
static void		mp_shutdown(void);
static globus_bool_t	mp_poll(globus_abstime_t *  time_stop,
				void *              user_args);

static int		mp_send_rsr(struct globus_nexus_buffer_s *buffer);
static globus_bool_t	mp_send_rsr_outstanding(globus_nexus_proto_t *nproto);
static void             mp_increment_reference_count(nexus_proto_t *nproto);
static globus_bool_t	mp_decrement_reference_count(nexus_proto_t *nproto);
static int		mp_get_my_mi_proto(nexus_byte_t **array,
					   int *size,
					   void *proto_info,
					   nexus_endpoint_t *endpoint);
static globus_bool_t	mp_construct_from_mi_proto(nexus_proto_t **proto,
						   nexus_mi_proto_t *mi_proto,
						   nexus_byte_t *array,
						   int size);
static int		mp_direct_info_size(void);

static void		mp_start_send(void);
static void		mp_check_outstanding_send(void);

static mp_proto_t *	construct_proto(globus_mp_destination_t destination);
static void		free_proto(mp_proto_t *proto);

static globus_bool_t	receive_messages(globus_abstime_t * time_stop);


#endif /* GLOBUS_USING_THIS_MP_PROTO */

#include "pr_mp.h"
/* Value returned by GLOBUS_NEXUS_MP_PROTOCOL_COUNT(). */
#define GLOBUS_L_MP_PROTO_COUNT            1

/*
 * Entry points that are built whether or not
 * GLOBUS_USING_THIS_MP_PROTO is defined.
 */
static nexus_proto_type_t	GLOBUS_NEXUS_MP_PROTOCOL_TYPE(void);

static globus_bool_t    GLOBUS_NEXUS_MP_PROTOCOL_SP_MATCH(
				      globus_nexus_mi_proto_t *   mi_proto0,
				      int                         offset0,
				      globus_byte_t *             subarray0,
				      int                         sub_length0,
				      globus_nexus_mi_proto_t *   mi_proto1,
				      int                         offset1,
			              globus_byte_t *             subarray1,
				      int                         sub_length1);

static int              GLOBUS_NEXUS_MP_PROTOCOL_COUNT(void);


/*
 * mp_proto_funcs
 *
 * The function table handed out by GLOBUS_NEXUS_MP_PROTOCOL_INFO().
 * The initializers are positional, so their order has to follow the
 * member order of nexus_proto_funcs_t (declared outside this file).
 *
 * When this protocol is in use every implemented entry point is
 * installed.  Otherwise only the type, sp_match and count entries are
 * provided and all other slots are GLOBUS_NULL.
 */
#if defined(GLOBUS_USING_THIS_MP_PROTO)

static nexus_proto_funcs_t mp_proto_funcs =
{
    GLOBUS_NEXUS_MP_PROTOCOL_TYPE,
    mp_init,
    mp_shutdown,
    mp_increment_reference_count,
    mp_decrement_reference_count,
    mp_get_my_mi_proto,
    mp_construct_from_mi_proto,
    NULL /* mp_destroy_my_mi_proto */,
    NULL /* mp_test_proto */,
    mp_send_rsr,
    mp_send_rsr_outstanding,
    mp_direct_info_size,
    NULL /* mp_direct_get */,
    GLOBUS_NEXUS_MP_PROTOCOL_SP_MATCH,
    GLOBUS_NEXUS_MP_PROTOCOL_COUNT,
};

#else

/* Stub table: same slots as above, unimplemented ones left NULL. */
static nexus_proto_funcs_t mp_proto_funcs =
{
    GLOBUS_NEXUS_MP_PROTOCOL_TYPE,
    GLOBUS_NULL,
    GLOBUS_NULL,
    GLOBUS_NULL,
    GLOBUS_NULL,
    GLOBUS_NULL,
    GLOBUS_NULL,
    GLOBUS_NULL,
    GLOBUS_NULL,
    GLOBUS_NULL,
    GLOBUS_NULL,
    GLOBUS_NULL,
    GLOBUS_NULL,
    GLOBUS_NEXUS_MP_PROTOCOL_SP_MATCH,
    GLOBUS_NEXUS_MP_PROTOCOL_COUNT,
};

#endif /* GLOBUS_USING_THIS_MP_PROTO */

/* start build all */
static int
GLOBUS_NEXUS_MP_PROTOCOL_SP_MATCH(
		    globus_nexus_mi_proto_t *   mi_proto0,
		    int                         offset0,
		    globus_byte_t *             subarray0,
		    int                         sub_length0,
	            globus_nexus_mi_proto_t *   mi_proto1,
		    int                         offset1,
	            globus_byte_t *             subarray1,
		    int                         sub_length1)
{
    if (strcmp((char *) (subarray0 + 1), (char *)(subarray1 + 1)) != 0 ||
	subarray0[0] != subarray1[0])
    {
        return GLOBUS_FALSE;
    }

    return GLOBUS_TRUE;
}


/*
 * GLOBUS_NEXUS_MP_PROTOCOL_COUNT()
 *
 * Report the protocol count of this module, GLOBUS_L_MP_PROTO_COUNT.
 */
static int GLOBUS_NEXUS_MP_PROTOCOL_COUNT(void)
{
    int proto_count = GLOBUS_L_MP_PROTO_COUNT;

    return proto_count;
}

/*
 * _nx_pr_*_info()
 *
 * Return the nexus_proto_funcs_t function table for this protocol module.
 *
 * This procedure is used for bootstrapping the protocol module.
 * The higher level Nexus code needs to call this routine to
 * retrieve the functions it needs to use this protocol module.
 */
void *GLOBUS_NEXUS_MP_PROTOCOL_INFO(void)
{
    return((void *) (&mp_proto_funcs));
} /* _nx_pr_*_info() */


/*
 * mp_proto_type()
 *
 * Identify this protocol module: always yields
 * GLOBUS_NEXUS_PROTO_TYPE_MP.
 */
static nexus_proto_type_t GLOBUS_NEXUS_MP_PROTOCOL_TYPE(void)
{
    nexus_proto_type_t proto_type = GLOBUS_NEXUS_PROTO_TYPE_MP;

    return proto_type;
} /* mp_proto_type() */


#if defined(GLOBUS_USING_THIS_MP_PROTO)

#ifdef GLOBUS_MP_PROTO_IS_THREAD_SAFE
/*
 * mp_handler_thread()
 *
 * Body of the handler thread used by the multi-threaded version.
 * mp_init() registers it as a one-shot callback.
 *
 * The thread tags itself as the handler thread, runs
 * receive_messages() under the module lock, and once that returns
 * announces its completion through handler_thread_done so that
 * mp_shutdown() can stop waiting.
 *
 *   time_stop  handed on to receive_messages().
 *   user_args  not used.
 *
 * Always returns NULL.
 */
static void *mp_handler_thread(
    globus_abstime_t *        time_stop,
    void *                    user_args)
{
    _nx_set_i_am_mp_handler_thread();

    /* Receive loop; its return value is of no interest here. */
    mp_enter();
    (void) receive_messages(time_stop);
    mp_exit();

    /* Tell mp_shutdown() that the handler is finished. */
    nexus_mutex_lock(&handler_thread_done_mutex);
    handler_thread_done = NEXUS_TRUE;
    nexus_cond_signal(&handler_thread_done_cond);
    nexus_mutex_unlock(&handler_thread_done_mutex);

    return (void *) NULL;
} /* mp_handler_thread() */
#endif /* GLOBUS_MP_PROTO_IS_THREAD_SAFE */


/*
 * mp_init()
 *
 * Initialize the MP protocol.
 *
 * Steps, in order:
 *   1. Pick the default storage size and bring up the message passing
 *      layer, node information and communicator.
 *   2. Agree on a session string: node 0 generates it and sends it to
 *      every other node, which blocks until it has been received.
 *   3. Initialize the module state (thread key, proto table, lock,
 *      send queue) and post the first receive.
 *   4. Arrange for incoming messages to be handled, either by a
 *      one-shot handler callback (thread safe build with preemptive
 *      threads) or by registering mp_poll() as a periodic callback.
 *
 * Any failure reported by the message passing layer is fatal.
 *
 *   add_to_my_mi_proto  out: always set to NEXUS_TRUE.
 */
static void mp_init(globus_bool_t * add_to_my_mi_proto)
{
    char *                    arg;         /* only used by disabled code */
    char *                    size_string; /* unused */
    int                       error;
    globus_reltime_t          delay_time;

    /* Get the default storage size */
    /*
     * This doesn't work.  mp_buf needs to propagate to other nodes
     */
    /*
    if ((arg = globus_nexus_option_find("mp_buf")) != GLOBUS_NULL)
    {
	mp_default_storage_size = MAX(atoi(arg), GLOBUS_MP_MIN_STORAGE_SIZE);
    }
    else
    */
    {
	mp_default_storage_size = GLOBUS_MP_DEFAULT_STORAGE_SIZE;
    }

    GLOBUS_MP_INITIALIZE();
    GLOBUS_MP_INIT_NODE_INFO(my_node, n_nodes);
    GLOBUS_MP_COMMUNICATOR_ALLOC(nexusl_pr_mp_communicator);


    /*
     * Setup the session_string
     *
     * The master node comes up with a unique session string, and
     * then sends that string to the other nodes.
     */
    if (my_node == 0)
    {
	int			i;
	globus_mp_destination_t	dest;
	globus_mp_send_status_t	status;
	globus_bool_t		done;
	int			msglen;
	
	session_string = globus_get_unique_session_string();
	session_string_length = strlen(session_string) + 1;
	/* The terminating NUL travels with the string. */
	msglen = session_string_length;
	
	for (i = 1; i < n_nodes; i++)
	{
	    GLOBUS_MP_INIT_DESTINATION(dest);
	    GLOBUS_MP_SET_DESTINATION(dest, i);
	    GLOBUS_MP_SEND(nexusl_pr_mp_communicator,
			   dest,
			   session_string,
			   msglen,
			   status,
			   error);
	    if (error != GLOBUS_SUCCESS)
	    {
		globus_fatal("Message passing library failed!!!\n");
	    }
	    
	    /* Spin until this send has completed. */
	    do
	    {
		GLOBUS_MP_SEND_STATUS(status, done, error);
		if (error != GLOBUS_SUCCESS)
		{
		    globus_fatal("Message passing library failed!!!\n");
		}
	    }
	    while (done == NEXUS_FALSE);

	    GLOBUS_MP_FREE_DESTINATION(dest);
	}
    }
    else
    {
	globus_mp_receive_status_t	status;
	globus_bool_t		done;
	int			msglen;

	NexusMalloc(mp_init(),
		    session_string,
		    char *,
		    NEXUS_MAX_SESSION_STRING_LENGTH);
	GLOBUS_MP_POST_RECEIVE(nexusl_pr_mp_communicator,
			       mp_init(),
			       session_string,
			       NEXUS_MAX_SESSION_STRING_LENGTH,
			       status,
			       error);
	if (error != GLOBUS_SUCCESS)
	{
	    globus_fatal("Message passing library failed!!!\n");
	}

	GLOBUS_MP_RECEIVE_WAIT(mp_init(),
			       status,
			       &msglen,
			       done,
			       error);
	if (error != GLOBUS_SUCCESS)
	{
	    globus_fatal("Message passing library failed!!!\n");
	}
	session_string_length = strlen(session_string) + 1;
    }
    
#ifndef BUILD_LITE
    nexus_thread_key_create(&i_am_mp_handler_thread_key, NULL);
#endif
    proto_table_init();
    nexus_mutex_init(&mp_mutex, (nexus_mutexattr_t *) NULL);
    mp_done = NEXUS_FALSE;
    handle_in_progress = NEXUS_FALSE;

    send_buffer = (struct globus_nexus_buffer_s *) NULL;
    send_q_head = (struct globus_nexus_buffer_s *) NULL;
    send_q_tail = (struct globus_nexus_buffer_s *) NULL;

    receive_buffer_size = mp_default_storage_size;
    MPPostReceive(mp_init(), error);
    if (error != GLOBUS_SUCCESS)
    {
	globus_fatal("Message passing library failed!!!\n");
    }
    
    /* Zero delay (and, for the periodic callback, zero period). */
    GlobusTimeReltimeSet(delay_time, 0, 0);
#ifdef GLOBUS_MP_PROTO_IS_THREAD_SAFE    
    if (nexus_preemptive_threads())
    {
	nexus_thread_t thread;

	using_handler_thread = NEXUS_TRUE;

	/* Create the handler thread */
	handler_thread_done = NEXUS_FALSE;
	nexus_mutex_init(&handler_thread_done_mutex,
			 (nexus_mutexattr_t *) NULL);
	nexus_cond_init(&handler_thread_done_cond,
			(nexus_condattr_t *) NULL);

	globus_callback_register_oneshot(GLOBUS_NULL,
					 &delay_time,
					 mp_handler_thread,
					 GLOBUS_NULL,
					 GLOBUS_NULL,
					 GLOBUS_NULL);
    }
    else
#endif /* GLOBUS_MP_PROTO_IS_THREAD_SAFE */
    {
	using_handler_thread = NEXUS_FALSE;

	globus_callback_register_periodic(&globus_l_nexus_mp_callback_handle,
					  &delay_time,
					  &delay_time,
					  mp_poll,
					  GLOBUS_NULL,
					  GLOBUS_NULL,
					  GLOBUS_NULL);
    }

    *add_to_my_mi_proto = NEXUS_TRUE;
} /* mp_init() */


/*
 * mp_shutdown()
 *
 * This routine is called during normal shutdown of a process.
 *
 * It sets mp_done, stops whichever mechanism was delivering incoming
 * messages (handler thread or periodic mp_poll() callback), cancels the
 * posted receive, and then releases the communicator and shuts the node
 * down.  The module lock is held throughout, except while waiting for
 * the handler thread to finish.  Any failure reported by the message
 * passing layer is fatal.
 */
static void mp_shutdown(void)
{
    int i;
    mp_proto_t *proto;
    globus_bool_t i_am_mp_handler_thread;
    int error;

    mp_enter();
    mp_done = NEXUS_TRUE;

    if (using_handler_thread)
    {
	_nx_i_am_mp_handler_thread(&i_am_mp_handler_thread);
	if (!i_am_mp_handler_thread)
	{
	    /*
	     * If this is not the mp handler thread, then we need
	     * to get the handler thread to shutdown.
	     *
	     * Since the other thread may be sitting in a blocking
	     * receive, we need to send a message to myself
	     * to wake up the handler thread.  Otherwise
	     * the handler will not notice the mp_done flag is set.
	     */
	    nexus_byte_t buf[1];
	    unsigned long buf_size;
	    globus_mp_send_status_t status;

	    /* Send message to myself */
	    buf_size = 1;
	    buf[0] = CLOSE_HANDLER_FLAG;
	    GLOBUS_MP_SEND(nexusl_pr_mp_communicator,
			   my_node,
			   buf,
			   buf_size,
			   status,
			   error);
	    if (error != GLOBUS_SUCCESS)
	    {
		globus_fatal("Message passing library failed!!!\n");
	    }

	    /*
	     * Wait for the handler thread to shutdown.  The module lock
	     * is dropped for the duration of the wait, because the
	     * handler thread runs receive_messages() under that lock.
	     */
	    mp_exit();
	    nexus_mutex_lock(&handler_thread_done_mutex);
	    while (!handler_thread_done)
	    {
		nexus_cond_wait(&handler_thread_done_cond,
				&handler_thread_done_mutex);
	    }
	    nexus_mutex_unlock(&handler_thread_done_mutex);
	    mp_enter();
	}
	nexus_mutex_destroy(&handler_thread_done_mutex);
	nexus_cond_destroy(&handler_thread_done_cond);
	using_handler_thread = NEXUS_FALSE;
    }
    else
    {
	/* No handler thread: stop the periodic mp_poll() callback. */
	globus_callback_unregister(globus_l_nexus_mp_callback_handle);
    }
    

    /*
     * Clear pending receives
     */
    MPReceiveCancel(mp_shutdown, error);
    if (error != GLOBUS_SUCCESS)
    {
	globus_fatal("Message passing library failed!!!\n");
    }

    /*
     * Call system routine to remove me from the system
     */
    GLOBUS_MP_COMMUNICATOR_FREE(nexusl_pr_mp_communicator);
    GLOBUS_MP_NODE_SHUTDOWN();
    
    mp_exit();

} /* mp_shutdown() */


/*
 * mp_poll()
 *
 * Periodic callback used when no separate handler thread exists.
 * While holding the module lock it pushes the current outgoing send
 * forward (if there is one) and, unless a message is already being
 * handled, lets receive_messages() pick up and dispatch incoming
 * messages.
 *
 *   time_stop  handed on to receive_messages().
 *   user_args  not used.
 *
 * Returns whatever receive_messages() reported, or NEXUS_FALSE when it
 * was not called.
 */
static globus_bool_t
mp_poll(
    globus_abstime_t *                  time_stop,
    void *                              user_args)
{
    globus_bool_t handled_any;

    /* A handler thread and polling are mutually exclusive. */
    NexusAssert2((!using_handler_thread),
		 ("mp_poll(): Internal error: "
		  "Should never be called when using a handler thread\n") );

    nexus_debug_printf(5, ("mp_poll(): entering\n"));

    handled_any = NEXUS_FALSE;

    mp_enter();
    if (send_buffer)
    {
	mp_check_outstanding_send();
    }
    if (!handle_in_progress)
    {
	handled_any = receive_messages(time_stop);
    }
    mp_exit();

    /*
     * Give up the processor only after a message was handled, since
     * its handler may have made another thread runnable.
     */
    if (handled_any)
    {
	globus_thread_yield();
    }

    nexus_debug_printf(5, ("mp_poll(): exiting\n"));

    return handled_any;
} /* mp_poll() */


/*
 * mp_send_rsr()
 *
 * Hand a buffer to this protocol module for sending.
 *
 * If the buffer has direct components, its barrier is initialized so
 * that completion can be signalled through it
 * (mp_check_outstanding_send() signals the barrier instead of
 * destroying the buffer).
 *
 * Under the module lock the buffer is either queued behind the send
 * that is currently outstanding, or becomes the current send and is
 * started right away.  In both cases the outstanding send is then
 * checked once so that progress is made as early as possible.
 *
 *   buffer  the buffer to send.
 *
 * Always returns 0.
 */
static int mp_send_rsr(struct globus_nexus_buffer_s *buffer)
{
    /* Pointer printed through %lx, as elsewhere in this file. */
    nexus_debug_printf(2,("mp_send_rsr(): invoked with buffer: 0x%lx\n",
			  (unsigned long) buffer));

    if (buffer->n_direct > 0)
    {
	nexus_mutex_init(&(buffer->barrier.mutex),(nexus_mutexattr_t *) NULL);
	nexus_cond_init(&(buffer->barrier.cond), (nexus_condattr_t *) NULL);
	buffer->barrier.count = buffer->n_direct;
	buffer->using_barrier = NEXUS_TRUE;
    }

    mp_enter();
    
    if (send_buffer)
    {
	/*
	 * There is a send outstanding.
	 * So enqueue this message.
	 * It will be sent either now or later
	 * from mp_check_outstanding_send().
	 */
	Enqueue(send_q_head, send_q_tail, buffer);
	nexus_debug_printf(2,("mp_send_rsr(): before check\n"));
	mp_check_outstanding_send();
	nexus_debug_printf(2,("mp_send_rsr(): after check\n"));
    }
    else
    {
	/*
	 * There is no send outstanding.
	 * So start sending this buffer.
	 */
	nexus_debug_printf(2,("mp_send_rsr(): before mp_start_send()\n"));
	send_buffer = buffer;
	mp_start_send();
	nexus_debug_printf(2,("mp_send_rsr(): before mp_check_outstanding_send()\n"));
	mp_check_outstanding_send();
	nexus_debug_printf(2,("mp_send_rsr(): after mp_check_outstanding_send()\n"));
    }

    mp_exit();
    
    nexus_debug_printf(2,("mp_send_rsr(): exiting\n"));

    return(0);
    
} /* mp_send_rsr() */


/*
 * mp_send_rsr_outstanding()
 *
 * Report whether a send is currently in flight.  The answer is
 * module-wide (it only looks at send_buffer); the nproto argument is
 * not consulted.
 *
 * Returns GLOBUS_TRUE while send_buffer is set, GLOBUS_FALSE otherwise.
 */
static globus_bool_t
mp_send_rsr_outstanding(globus_nexus_proto_t *nproto)
{
    globus_bool_t outstanding;

    mp_enter();
    outstanding = send_buffer ? GLOBUS_TRUE : GLOBUS_FALSE;
    mp_exit();

    return outstanding;
} /* mp_send_rsr_outstanding() */


/*
 * mp_start_send()
 *
 * Post the first send for the buffer held in the 'send_buffer' global.
 *
 * A base segment that fits into the default storage size is sent as
 * is, and current_base_segment is cleared to record that.  A larger
 * one is announced first by a small header (protocol version,
 * BIG_MESSAGE_FLAG, local data format, base segment size);
 * current_base_segment is left alone in that case, which is how
 * mp_check_outstanding_send() knows the base segment still has to go
 * out.
 *
 * The send handle is left in 'send_status'.  A failure to post the
 * send is fatal.
 */
static void mp_start_send()
{
    mp_proto_t *proto;
    nexus_byte_t *payload;
    unsigned long payload_size;
    nexus_byte_t *cursor;
    int error;

#ifdef NEXUS_CRITICAL_PATH_TIMER
    if (_nx_critical_path_timer_started)
    {
	UTP_stop_timer(_nx_critical_path_stop_timer);
	_nx_critical_path_timer_started = NEXUS_FALSE;
    }
#endif

    proto = (mp_proto_t *) send_buffer->proto;
    payload_size = send_buffer->base_segments->size_used;

    if (payload_size <= mp_default_storage_size)
    {
	/* Small message: the base segment itself goes out now. */
	payload = send_buffer->base_segments->current;
	send_buffer->current_base_segment = (nexus_base_segment_t *) NULL;
    }
    else
    {
	/* Big message: build and send the announcement header. */
	cursor = send_big_message_header_buf;
	*cursor++ = GLOBUS_L_NEXUS_MP_PROTOCOL_VERSION;
	*cursor++ = BIG_MESSAGE_FLAG;
	*cursor++ = NEXUS_DC_FORMAT_LOCAL;
	nexus_dc_put_u_long(&cursor, &payload_size, 1);
	payload = send_big_message_header_buf;
	payload_size = (cursor - send_big_message_header_buf);
    }

    GLOBUS_MP_SEND(nexusl_pr_mp_communicator,
		   proto->destination,
		   payload,
		   payload_size,
		   send_status,
		   error);
    if (error != GLOBUS_SUCCESS)
    {
	globus_fatal("Message passing library failed!!!\n");
    }

} /* mp_start_send() */


/*
 * mp_check_outstanding_send()
 *
 * Check the status of the current outstanding send.  If it is
 * complete, then post the next send if there is one.
 * The next send may be either a direct component of send_buffer,
 * or the next message in the send_q.
 *
 * Assumptions:
 *   - send_buffer != NULL
 *   - A send has already been posted for send_buffer
 *   - send_status is the status handle for the posted send
 *   - mp_enter() has already been called
 */
static void mp_check_outstanding_send()
{
    globus_bool_t done;
    globus_bool_t send_done;
    nexus_byte_t *send_bytes;
    unsigned long send_size;
    mp_proto_t *proto;
    int error;
    
    done = NEXUS_FALSE;
    while (!done)
    {
	GLOBUS_MP_SEND_STATUS(send_status, send_done, error);
	if (error != GLOBUS_SUCCESS)
	{
	    globus_fatal("Message passing library failed!!!\n");
	}
	if (send_done)
	{
	    /*
	     * The current outstanding send completed.
	     * So figure out what the next send is, and post it.
	     */

	    if (send_buffer->current_base_segment)
	    {
		/*
		 * The big message header message has been sent.
		 * So send the base_segment.
		 */
		proto = (mp_proto_t *) send_buffer->proto;
		send_size = send_buffer->base_segments->size_used;
		send_bytes = send_buffer->base_segments->current;
		send_buffer->current_base_segment
		    = (nexus_base_segment_t *) NULL;
    
		GLOBUS_MP_SEND(nexusl_pr_mp_communicator,
			       proto->destination,
			       send_bytes,
			       send_size,
			       send_status,
			       error);
		if (error != GLOBUS_SUCCESS)
		{
		    globus_fatal("Message passing library failed!!!\n");
		}
	    }
	    else if (   send_buffer->direct_segments
		     && (send_buffer->direct_segments->n_left > 0))
	    {
		/*
		 * The previous segment (base or direct) has been sent,
		 * and there is at least one more direct segment to send.
		 */
		nexus_direct_info_t *direct_info;
	
		nexus_debug_printf(1, ("mp_check_outstanding_send(): begin sending buffer direct segment 0x%lx\n", (unsigned long) send_buffer));

		direct_info = send_buffer->direct_segments->current;
		send_buffer->direct_segments->current++;
		send_buffer->direct_segments->n_left--;

		proto = (mp_proto_t *) send_buffer->proto;
		send_bytes = direct_info->data;
		send_size = direct_info->size;

		GLOBUS_MP_SEND(nexusl_pr_mp_communicator,
			       proto->destination,
			       send_bytes,
			       send_size,
			       send_status,
			       error);
		if (error != GLOBUS_SUCCESS)
		{
		    globus_fatal("Message passing library failed!!!\n");
		}
	    }
	    else
	    {
		/*
		 * We are done with this buffer.
		 * So free the buffer, and go on to the next one
		 * in the send_q.
		 */

		if (send_buffer->using_barrier)
		{
		    /*
		     * Signal the thread waiting on this buffer.
		     */
		    nexus_mutex_lock(&(send_buffer->barrier.mutex));
		    send_buffer->barrier.count--;
		    nexus_cond_signal(&(send_buffer->barrier.cond));
		    nexus_mutex_unlock(&(send_buffer->barrier.mutex));
		}
		else
		{
		    /*
		     * There is no thread waiting on this buffer.
		     * So destroy the buffer.
		     */
		    nexus_buffer_destroy(&send_buffer);
		}

		if (QueueNotEmpty(send_q_head))
		{
		    /* Grab the next buffer and start sending it */
		    Dequeue(send_q_head,
			    send_q_tail,
			    send_buffer);
		    mp_start_send();
		}
		else
		{
		    /* Nothing else waiting to be sent */
		    send_buffer = (struct globus_nexus_buffer_s *) NULL;
		    done = NEXUS_TRUE;
		}
		
	    }
	}
	else
	{
	    /*
	     * The current outstanding send is not yet complete.
	     * So break out of the loop and let this function return.
	     */
	    done = NEXUS_TRUE;
	}
    }
} /* mp_check_outstanding_send() */

     
/*
 * mp_increment_reference_count()
 *
 * Add one reference to the mp_proto_t behind 'nproto'.  The update is
 * made while holding the module lock.
 */
static void mp_increment_reference_count(nexus_proto_t *nproto)
{
    mp_proto_t *mp_proto;

    mp_proto = (mp_proto_t *) nproto;

    mp_enter();
    mp_proto->reference_count += 1;
    mp_exit();
} /* mp_increment_reference_count() */


/*
 * mp_decrement_reference_count()
 *
 * Drop one reference from the mp_proto_t behind 'nproto', holding the
 * module lock while doing so.  A count that falls below zero trips an
 * assertion.
 *
 * The proto is never freed here, whatever the resulting count, so the
 * return value is always NEXUS_FALSE ("this call did not free the
 * proto").
 */
static globus_bool_t mp_decrement_reference_count(nexus_proto_t *nproto)
{
    mp_proto_t *proto;

    proto = (mp_proto_t *) nproto;

    mp_enter();
    proto->reference_count -= 1;
    NexusAssert2((proto->reference_count >= 0),
		 ("mp_decrement_reference_count(): Internal error: Reference count < 0\n"));
    mp_exit();

    return NEXUS_FALSE;
} /* mp_decrement_reference_count() */


/*
 * mp_get_my_mi_proto()
 *
 * Build the machine independent description of this node for the mp
 * protocol.  The freshly allocated array is laid out as:
 *
 *   byte 0                    GLOBUS_L_NEXUS_MP_MI_PROTO_VERSION
 *   session_string_length     the session string, NUL included
 *   remainder                 whatever GLOBUS_MP_GET_MY_MI_PROTO()
 *                             writes (its size is obtained from
 *                             GLOBUS_MP_GET_MY_MI_PROTO_SIZE())
 *
 *   array       out: the allocated array.
 *   size        out: total number of bytes in *array.
 *   proto_info  not used.
 *   endpoint    not used.
 *
 * Always returns 0.
 */
static int mp_get_my_mi_proto(nexus_byte_t **array,
			      int *size,
			      void *proto_info,
			      nexus_endpoint_t *endpoint)
{
    int node_info_size;
    nexus_byte_t *proto_bytes;

    GLOBUS_MP_GET_MY_MI_PROTO_SIZE(node_info_size);

    *size = (1 + session_string_length + node_info_size);
    NexusMalloc(mp_get_my_mi_proto(),
		*array,
		nexus_byte_t *,
		*size);
    proto_bytes = *array;

    /* Version byte, then the session string with its NUL. */
    proto_bytes[0] = GLOBUS_L_NEXUS_MP_MI_PROTO_VERSION;
    memcpy(proto_bytes + 1, session_string, session_string_length);

    /* The node specific part follows the session string. */
    GLOBUS_MP_GET_MY_MI_PROTO((proto_bytes + 1 + session_string_length));

    return 0;
} /* mp_get_my_mi_proto() */


/*
 * mp_construct_from_mi_proto()
 *
 * From the passed machine independent protocol list ('mi_proto'), plus
 * the mp specific entry from that list ('array' and 'size'),
 * see if I can use the information to create a nexus_proto_t object
 * that can be used to connect to the node:
 *	- If I cannot use this protocol to attach to the node, then
 *		return NEXUS_FALSE.  (This option is useful if two nodes
 *		both speak a particular protocol, but they cannot
 *		talk to each other via that protocol.  For example,
 *		on two MPP, the nodes within a single MPP can
 *		talk to each other via the native messaging protocol,
 *		but cannot talk to the nodes on the other MPP
 *		using that native protocol.)
 *	- If this mp protocol points to myself, then set
 *		*proto=NULL, and return NEXUS_TRUE.
 *	- Otherwise, construct a mp protocol object for this mi_proto
 *		and put it in *proto.  Then return NEXUS_TRUE.
 *
 * The entry is expected to have the layout written by
 * mp_get_my_mi_proto(): a version byte, the NUL terminated session
 * string, then the destination information.  'size' is used to make
 * sure the version byte and the session string are only read when the
 * entry is long enough to contain them; an entry that is too short is
 * treated as not usable (NEXUS_FALSE).
 *
 * A version mismatch is additionally reported through
 * _nx_fault_detected().
 */
static globus_bool_t mp_construct_from_mi_proto(nexus_proto_t **proto,
					       nexus_mi_proto_t *mi_proto,
					       nexus_byte_t *array,
					       int size)
{
    globus_mp_destination_t destination;
    globus_bool_t result;
    int version;

    /*
     * An empty entry does not even carry a version byte.
     */
    if (size < 1)
    {
	return(NEXUS_FALSE);
    }

    /*
     * Check the mp mi_proto version
     */
    version = (int) array[0];
    if (version != GLOBUS_L_NEXUS_MP_MI_PROTO_VERSION)
    {
	_nx_fault_detected(GLOBUS_NEXUS_ERROR_VERSION_MISMATCH);
	return(NEXUS_FALSE);
    }
    
    /*
     * Compare the session string from the array with mine.
     *
     * session_string_length includes the terminating NUL, so comparing
     * exactly that many bytes accepts the same entries as a string
     * compare would, without ever reading past the end of an entry
     * that is too short or not NUL terminated.
     */
    if (size < 1 + session_string_length
	|| memcmp(array + 1, session_string, session_string_length) != 0)
    {
	return(NEXUS_FALSE);
    }

    /*
     * Extract the mp_destination_t from the array
     */
    GLOBUS_MP_CONSTRUCT_FROM_MI_PROTO(destination,
			       mi_proto,
			       (array + 1 + session_string_length));

    /*
     * Test to see if this mi_proto points to myself.
     * If it does, then return *proto=NULL.
     */
    GLOBUS_MP_COMPARE_DESTINATIONS(destination, my_node, result);
    if (result)
    {
	*proto = (nexus_proto_t *) NULL;
    }
    else
    {
	/* The proto table is only touched under the module lock. */
	mp_enter();
	*proto = (nexus_proto_t *) construct_proto(destination);
	mp_exit();
    }
    return (NEXUS_TRUE);
} /* mp_construct_from_mi_proto() */


/*
 * mp_direct_info_size()
 *
 * Currently a stub: the reported size is always 0.
 */
static int mp_direct_info_size(void)
{
    /* TODO: This needs to be filled in */
    int direct_info_bytes = 0;

    return direct_info_bytes;
} /* mp_direct_info_size() */


/*
 * construct_proto()
 *
 * Construct a mp_proto_t for the given destination. Look up in the
 * proto table to see if one already exists. If it does, bump its reference
 * count and return that one. Otherwise create one, insert into the
 * table with a reference count of 1 and return it.
 *
 * The proto table and the reference count are touched without any
 * locking here; mp_construct_from_mi_proto() calls this between
 * mp_enter() and mp_exit().
 */
static mp_proto_t *construct_proto(globus_mp_destination_t destination)
{
    mp_proto_t *proto;

    proto = proto_table_lookup(&destination);
    /* Print the pointer with %p: %x takes an unsigned int, not a pointer */
    nexus_debug_printf(3,
		       ("construct_proto(): Table lookup returns proto=%p\n",
			(void *) proto));
    if (proto == (mp_proto_t *) NULL)
    {
	/* First reference to this destination: build a new proto */
	NexusMalloc(construct_proto(), proto, mp_proto_t *,
		    sizeof(mp_proto_t));

	proto->type = GLOBUS_NEXUS_PROTO_TYPE_MP;
	proto->funcs = &mp_proto_funcs;
	proto->version = GLOBUS_L_NEXUS_MP_PROTOCOL_VERSION;
	proto->direct_custom_min_size = GLOBUS_MP_BIG_MESSAGE_MIN_SIZE;
	proto->direct_custom_max_size = NEXUS_DC_MAX_U_LONG;
	proto->direct_pointer_min_size = NEXUS_DC_MAX_U_LONG;
	proto->direct_pointer_max_size = NEXUS_DC_MAX_U_LONG;
	proto->can_use_iovec = NEXUS_FALSE;
	proto->reserved_header_size = 0;
	proto->reference_count = 1;

	GLOBUS_MP_COPY_DESTINATION(proto->destination, destination);
	
	/* Publish it so later lookups for this destination share it */
	proto_table_insert(proto);
    }
    else
    {
	/* Already known: hand out the shared proto with one more reference */
	proto->reference_count++;
    }
	
    return (proto);
} /* construct_proto() */


/*
 * free_proto()
 *
 * Free the passed 'proto': first its destination, via
 * GLOBUS_MP_FREE_DESTINATION(), then the mp_proto_t itself.
 *
 * The proto table is not touched here.
 * NOTE(review): presumably the caller must make sure 'proto' is no longer
 * reachable through proto_table before calling this -- confirm.
 */
static void free_proto(mp_proto_t *proto)
{
    /* The destination must be released while 'proto' is still valid */
    GLOBUS_MP_FREE_DESTINATION(proto->destination);

    NexusFree(proto);
} /* free_proto() */


/*
 * receive_messages();
 *
 * Receive and dispatch pending messages to this node.
 *
 * If 'time_stop' is infinity (per globus_time_abstime_is_infinity()),
 * wait for a message with GLOBUS_MP_RECEIVE_WAIT().
 * Otherwise poll the posted receive with GLOBUS_MP_RECEIVE_STATUS().
 *
 * The loop keeps going until globus_callback_has_time_expired() reports
 * that time is up.  It is left early when mp_done is set, or when a
 * message carrying the wrong protocol version arrives.
 *
 * NOTE(review): 'done' is still assigned in several places, but the
 * '!done' test in the loop condition is commented out, so it no longer
 * ends the loop.  Confirm that this is intended, in particular for
 * CLOSE_HANDLER_FLAG.
 *
 * NOTE(review): this routine calls mp_exit() before dispatching and
 * mp_enter() afterwards, so presumably the caller enters with
 * mp_enter() already done -- confirm.
 *
 * Note: If using_handler_thread==NEXUS_TRUE,
 *	 this routine will never be called by anyone except the
 *	 handler thread.  (Though it may be called recursively by the
 *	 the handler thread, if a send_rsr is done from within a handler.)
 *
 * Return: NEXUS_TRUE if a message is handled, otherwise NEXUS_FALSE
 */
static globus_bool_t
receive_messages(
    globus_abstime_t *                  time_stop)
{
    globus_bool_t done = NEXUS_FALSE;
    globus_bool_t message_received;
    globus_bool_t message_handled = NEXUS_FALSE;
    globus_bool_t need_to_relock;
    nexus_byte_t *b;
    nexus_byte_t *buf;
    unsigned long buf_size;
    unsigned long msg_size;
    int version;
    int format;
    struct globus_nexus_buffer_s *buffer;
    int error;
    
    nexus_debug_printf(5, ("receive_messages(): entering\n"));
    
    /* Make sure a receive is posted before waiting or polling on it */
    MPPostReceive(receive_messages(), error);
    if (error != GLOBUS_SUCCESS)
    {
	globus_fatal("Message passing library failed!!!\n");
    }
    
    do
    {
	if (globus_time_abstime_is_infinity((globus_abstime_t*)time_stop))
	{
	    /*
	     * No time limit: wait for a message.
	     * This 'msg_size' shadows the function level one and only
	     * receives the size reported by GLOBUS_MP_RECEIVE_WAIT().
	     */
	    int				msg_size;

	    if (using_handler_thread)
	    {
		/*
		 * This is the handler thread.  Since it is thread safe,
		 * we should do an mp_exit() so that other threads can
		 * do sends while this thread is blocked in the receive.
		 */
		need_to_relock = NEXUS_TRUE;
		mp_exit();
	    }
	    else
	    {
		need_to_relock = NEXUS_FALSE;
		done = NEXUS_TRUE; /* Do the blocking receive just once */
	    }
	    GLOBUS_MP_RECEIVE_WAIT(receive_messages(),
				   receive_status,
				   &msg_size,
				   message_received,
				   error);
	    if (error != GLOBUS_SUCCESS)
	    {
		globus_fatal("Message passing library failed!!!\n");
	    }
	    
	    /* Undo the mp_exit() done above for the handler thread */
	    if (need_to_relock)
	    {
		mp_enter();
	    }
	}
	else
	{
	    /* A stop time was given: only check the posted receive */
	    GLOBUS_MP_RECEIVE_STATUS(receive_messages(),
				     receive_status,
				     message_received,
				     error);
	    if (error != GLOBUS_SUCCESS)
	    {
		globus_fatal("Message passing library failed!!!\n");
	    }
	}
	
	/* Once mp_done is set, leave without looking at the receive result */
	if (mp_done)
	    return (message_handled);
	
	if (message_received)
	{
	    /*
	     * The posted receive completed; the message is in
	     * 'receive_buffer'.  Its first byte is the protocol version
	     * and the second is either a data format or a control flag.
	     */
	    receive_posted = NEXUS_FALSE;
	    b = receive_buffer;
	    
	    version = (int) *b++;
	    if (version != GLOBUS_L_NEXUS_MP_PROTOCOL_VERSION)
	    {
		/*
		 * We got a Nexus buffer version mismatch with the
		 * sender of this message.  So don't post anymore
		 * receives, kick out a fault, and get out.
		 */
		mp_exit();
		if (_nx_fault_detected(GLOBUS_NEXUS_ERROR_VERSION_MISMATCH) !=0)
		{
		    globus_fatal("pr_mp.c:receive_messages(): Version mismatch\n");
		}
		mp_enter();
		goto abort;
	    }
	    
	    format = (int) *b++;
	    if (format < NEXUS_DC_FORMAT_LAST)
	    {
		/* We received a message */
		GLOBUS_MP_START_CRITICAL_PATH_TIMER();
		/*
		 * Detach the storage from 'receive_buffer' before it is
		 * wrapped in a Nexus buffer below.
		 * NOTE(review): presumably the created buffer takes
		 * ownership of 'buf', and MPPostReceive() allocates a
		 * replacement when receive_buffer is NULL -- confirm.
		 */
		buf = receive_buffer;
		buf_size = receive_buffer_size;
		receive_buffer = (nexus_byte_t *) NULL;
		nexus_dc_get_u_long(&b, &msg_size, 1, format);
		if (msg_size > mp_default_storage_size)
		{
		    /*
		     * This was a big message.  Once no more big messages
		     * are pending, go back to the default buffer size.
		     */
		    if (--receive_pending_big_messages == 0)
		    {
			receive_buffer_size = mp_default_storage_size;
		    }
		}
		
		/*
		 * Leaving this in will cause the next receive
		 * to be posted before this message is handled.
		MPPostReceive(receive_messages());
		*/

		/* Dispatch between mp_exit() and mp_enter() */
		handle_in_progress = NEXUS_TRUE;
		mp_exit();
		_nx_buffer_create_from_raw(buf,
					   buf_size,
					   0,
					   msg_size,
					   NULL,
					   &buffer);
		nexus_debug_printf(2,("pr_mp.c:receive_messages(): dispatching message\n"));
		_nx_buffer_dispatch(buffer);
		nexus_debug_printf(2,("pr_mp.c:receive_messages(): message dispatch complete\n"));
		mp_enter();
		handle_in_progress = NEXUS_FALSE;
		message_handled = NEXUS_TRUE;
		/* Post the next receive now that this message is handled */
		MPPostReceive(receive_messages(), error);
		if (error != GLOBUS_SUCCESS)
		{
		    globus_fatal("Message passing library failed!!!\n");
		}
	    }
	    else if (format == BIG_MESSAGE_FLAG)
	    {
		/*
		 * Control message that carries a format byte and a size.
		 * If that size is larger than the current receive buffer,
		 * drop the buffer and record the larger size before the
		 * next receive is posted.
		 */
		format = (int) *b++;
		nexus_dc_get_u_long(&b, &buf_size, 1, format);
		receive_pending_big_messages++;
		if (buf_size > receive_buffer_size)
		{
		    NexusFree(receive_buffer);
		    receive_buffer = (nexus_byte_t *) NULL;
		    receive_buffer_size = buf_size;
		}
		MPPostReceive(receive_messages(), error);
		if (error != GLOBUS_SUCCESS)
		{
		    globus_fatal("Message passing library failed!!!\n");
		}
	    }
	    else if (format == CLOSE_HANDLER_FLAG)
	    {
		/* Terminate the handler thread */
		/* No new receive is posted on this path */
		done = NEXUS_TRUE;
	    }
	    else
	    {
		globus_fatal("receive_messages(): Got unknown control message\n");
	    }
	}
	else
	{
	    /* The (non-blocking) receive didn't receive anything, */
	    /* so give other threads a chance to run. */
	    done = NEXUS_TRUE;
	    globus_thread_yield();
	}
    } while(/* !done && */!globus_callback_has_time_expired());

    /* Also reached directly from the version mismatch path above */
  abort:
    nexus_debug_printf(5,
		       ("receive_messages(): returning message_handled=%d\n",
			message_handled) );

    return (message_handled);
    
} /* receive_messages() */


/*
 * proto_table_init()
 *
 * Initialize the protocol table.
 */
static void proto_table_init(void)
{
    int i;

    for (i = 0; i < PROTO_TABLE_SIZE; i++)
    {
	proto_table[i].proto = (mp_proto_t *) NULL;
	proto_table[i].next = (proto_table_entry_t *) NULL;
    }
} /* proto_table_init() */


/*
 * proto_table_insert()
 *
 * Add 'proto' to the table, in the bucket chosen by hashing its
 * destination.
 *
 * The caller guarantees that the proto is not in the table yet; no
 * duplicate check is made here.
 */
static void proto_table_insert(mp_proto_t *proto)
{
    int slot;
    proto_table_entry_t *head;
    proto_table_entry_t *new_ent;

    GLOBUS_MP_HASH_DESTINATION(proto->destination, slot);
    head = &proto_table[slot];

    /* An unused bucket head takes the proto directly */
    if (head->proto == NULL)
    {
	head->proto = proto;
	return;
    }

    /*
     * The bucket head is taken: allocate a chain entry and link it
     * in right behind the head.
     */
    NexusMalloc(proto_table_insert(),
		new_ent,
		proto_table_entry_t *,
		sizeof(struct _proto_table_entry_t));

    new_ent->next = head->next;
    new_ent->proto = proto;
    head->next = new_ent;
} /* proto_table_insert() */


/*
 * proto_table_lookup()
 *
 * Find the mp_proto_t stored in the table for the destination '*dest'.
 *
 * Return: the matching proto, or NULL if the table holds none.
 */
static mp_proto_t *proto_table_lookup(globus_mp_destination_t *dest)
{
    proto_table_entry_t *cur;
    int slot;
    globus_bool_t same;

    GLOBUS_MP_HASH_DESTINATION(*dest, slot);

    /* Walk the bucket head and then its overflow chain */
    cur = &proto_table[slot];
    while (cur != NULL)
    {
	/* The bucket head may be empty; only compare real entries */
	if (cur->proto != NULL)
	{
	    GLOBUS_MP_COMPARE_DESTINATIONS(*dest,
					   cur->proto->destination,
					   same);
	    if (same)
	    {
		return cur->proto;
	    }
	}
	cur = cur->next;
    }

    return NULL;
} /* proto_table_lookup() */

#undef GLOBUS_USING_THIS_MP_PROTO

#endif /* GLOBUS_USING_THIS_MP_PROTO */
