/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
 * (C) 2001 by Argonne National Laboratory.
 *     See COPYRIGHT in top-level directory.
 */

#if !defined(MPIU_THREAD_POBJ_H_INCLUDED)
#define MPIU_THREAD_POBJ_H_INCLUDED

/* There are multiple locks, one for each (major) object */

/* MT FIXME the following description is almost right, but it needs minor
 * updates and revision to account for the COMPLETION CS and other issues in
 * the request */
/* The fine-grained locking discipline for requests is unfortunately
 * complicated:
 *
 * (1) Raw allocation and deallocation of requests is protected internally by
 * the HANDLEALLOC critical section.  This is currently the same as the HANDLE
 * CS, not sure why we have both...
 *
 * (2) Once allocated, a directly allocated request is initially held
 * exclusively by a single thread.  Direct allocation is common for send
 * requests, but recv requests are usually created differently.
 *
 * (3) Most receive requests are created as the result of a call to FDP_or_AEU
 * or FDU_or_AEP.  Calls to these functions (along with the other receive
 * queue functions) must be inside a MSGQUEUE CS.  This CS protects the queue
 * data structures as well as any fields inside the requests while they are in
 * the queue.  For example, assume a call to FDU_or_AEP, as in MPID_Recv.  If
 * the FDU case hits, the MSGQUEUE CS may be released immediately after the
 * call.  If the AEP case hits, however, the MSGQUEUE CS must remain held
 * until any request field manipulation (such as dev.recv_pending_count) is
 * complete (a sketch follows this comment).
 *
 * (4) In both the send and receive request cases, there is usually a
 * particular thread in some upper-level code (e.g. MPI_Send) with interest in
 * the completion of the request.  This may or may not be a thread that is
 * also making progress on this request (often not).  The upper-level code
 * must not attempt to access any request fields (such as the status) until
 * completion is signalled by the lower layer.
 *
 * (5) Once removed from the receive queue, the request is once again
 * exclusively owned by the dequeuing thread.  From here, the dequeuing thread
 * may do whatever it wants with the request without holding any CS, until it
 * signals the request's completion.  Signalling completion indicates that the
 * thread in the upper layer polling on it may access the rest of the fields
 * in the request.  This completion signalling is lock-free and must be
 * implemented carefully to work correctly in the face of optimizing compilers
 * and CPUs.  The upper-level thread now wholly owns the request until it is
 * deallocated.
 *
 * (6) In ch3:nemesis at least, multithreaded access to send requests is
 * managed by the MPIDCOMM (progress engine) CS.  The completion signalling
 * pattern applies here (think MPI_Isend/MPI_Wait).
 *
 * (7) Request cancellation is tricky-ish.  For send cancellation, it is
 * possible that the completion counter is actually *incremented* because a
 * pkt is sent to the recipient asking for remote cancellation.  By asking for
 * cancellation (of any kind of req), the upper layer gives up its exclusive
 * access to the request and must wait for the completion counter to drop to 0
 * before exclusively accessing the request fields.
 *
 * The completion counter is a reference count, much like the object liveness
 * reference count.  However, it differs from a normal refcount because of
 * guarantees in the MPI Standard.  Applications must not attempt to complete
 * (wait/test/free) a given request concurrently in two separate threads.  So
 * checking for cc==0 is safe because only one thread is ever allowed to make
 * that check.
 *
 * A non-zero completion count must always be accompanied by a normal
 * reference that is logically held by the progress engine.  Similarly, once
 * the completion counter drops to zero, the progress engine is expected to
 * release its reference.
 */
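/* A minimal sketch (not actual MPICH code) of the discipline in point (3)
 * above, assuming FDU_or_AEP reports via a "found" flag; the argument list is
 * elided and the variable names are illustrative:
 *
 *     int found;
 *     MPID_Request *rreq;
 *
 *     MPIU_THREAD_CS_ENTER(MSGQUEUE,);
 *     rreq = MPIDI_CH3U_Recvq_FDU_or_AEP(..., &found);
 *     if (!found) {
 *         // AEP case: the request is still reachable via the posted queue,
 *         // so its fields may only be touched while the MSGQUEUE CS is held
 *         rreq->dev.recv_pending_count = 1;
 *     }
 *     MPIU_THREAD_CS_EXIT(MSGQUEUE,);
 *     // FDU case: rreq is now exclusively owned by this thread; no CS is
 *     // needed from here until completion is signalled to the upper layer
 */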
/* lock ordering: if MPIDCOMM+MSGQUEUE must be acquired at the same time, then
 * the order should be to acquire MPIDCOMM first, then MSGQUEUE.  Release in
 * reverse order. */

/* POBJ locks are all real nonrecursive ops */
#define MPIUI_THREAD_CS_ENTER_POBJ(mutex) MPIUI_THREAD_CS_ENTER_NONRECURSIVE("POBJ", mutex)
#define MPIUI_THREAD_CS_EXIT_POBJ(mutex) MPIUI_THREAD_CS_EXIT_NONRECURSIVE("POBJ", mutex)
#define MPIUI_THREAD_CS_YIELD_POBJ(mutex) MPIUI_THREAD_CS_YIELD_NONRECURSIVE("POBJ", mutex)

/* ALLGRAN locks are all real nonrecursive ops */
#define MPIUI_THREAD_CS_ENTER_ALLGRAN(mutex) MPIUI_THREAD_CS_ENTER_NONRECURSIVE("ALLGRAN", mutex)
#define MPIUI_THREAD_CS_EXIT_ALLGRAN(mutex) MPIUI_THREAD_CS_EXIT_NONRECURSIVE("ALLGRAN", mutex)
#define MPIUI_THREAD_CS_YIELD_ALLGRAN(mutex) MPIUI_THREAD_CS_YIELD_NONRECURSIVE("ALLGRAN", mutex)

/* GLOBAL locks are all NO-OPs */
#define MPIUI_THREAD_CS_ENTER_GLOBAL(mutex) do {} while (0)
#define MPIUI_THREAD_CS_EXIT_GLOBAL(mutex) do {} while (0)
#define MPIUI_THREAD_CS_YIELD_GLOBAL(mutex) do {} while (0)

/* define a type for the completion counter */
#include "opa_primitives.h"
typedef OPA_int_t MPIU_cc_t;

/* implies no barrier, since this routine should only be used for request
 * initialization */
static inline void MPIU_cc_set(MPIU_cc_t * cc_ptr, int val)
{
    if (val == 0) {
        /* values other than 0 do not enforce any ordering, and therefore do
         * not start a HB arc */
        /* MT FIXME using cc_set in this way is sloppy.  Sometimes the caller
         * really does know that the cc value may be cleared, but more likely
         * this is just a hack to avoid the work of figuring out what the cc
         * value currently is and decrementing it instead. */

        /* barrier ensures that any state written before indicating completion
         * is seen by the thread polling on the cc.  If OPA adds store-release
         * semantics, we can convert to that instead. */
        OPA_write_barrier();
        MPL_VG_ANNOTATE_HAPPENS_BEFORE(cc_ptr);
    }

#if defined(MPL_VG_AVAILABLE)
    /* MT subtle: store_int is actually safe to use, but Helgrind/DRD/TSan all
     * view the store/load pair as a race.  Using an atomic operation for the
     * store side makes all three happy.  DRD & TSan also support
     * ANNOTATE_BENIGN_RACE, but Helgrind does not. */
    OPA_swap_int(cc_ptr, val);
#else
    OPA_store_int(cc_ptr, val);
#endif
}

ATTRIBUTE((unused))
static MPL_DBG_INLINE_KEYWORD int MPIU_cc_is_complete(MPIU_cc_t * cc_ptr)
{
    int complete;

    complete = (0 == OPA_load_int(cc_ptr));
    if (complete) {
        MPL_VG_ANNOTATE_HAPPENS_AFTER(cc_ptr);
        OPA_read_barrier();
    }
    return complete;
}

/* incomplete_==TRUE iff the cc > 0 after the decr */
#define MPIU_cc_decr(cc_ptr_, incomplete_)                  \
    do {                                                    \
        OPA_write_barrier();                                \
        MPL_VG_ANNOTATE_HAPPENS_BEFORE(cc_ptr_);            \
        *(incomplete_) = !OPA_decr_and_test_int(cc_ptr_);   \
        /* TODO check if this HA is actually necessary */   \
        if (!*(incomplete_)) {                              \
            MPL_VG_ANNOTATE_HAPPENS_AFTER(cc_ptr_);         \
        }                                                   \
    } while (0)
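/* A minimal sketch (not actual MPICH code) of the lock-free completion
 * signalling pattern built from these helpers; the request layout and the
 * progress call site are illustrative assumptions:
 *
 *     int incomplete;
 *
 *     // lower layer / progress engine: write results, then signal
 *     req->status.MPI_ERROR = MPI_SUCCESS;     // must precede the decr
 *     MPIU_cc_decr(&req->cc, &incomplete);     // write barrier inside
 *     if (!incomplete) {
 *         // cc hit zero: drop the progress engine's object reference here
 *     }
 *
 *     // upper layer (e.g. MPI_Wait): poll, then read request fields
 *     while (!MPIU_cc_is_complete(&req->cc))
 *         MPID_Progress_wait(&progress_state); // hypothetical call site
 *     // the read barrier in MPIU_cc_is_complete guarantees that the status
 *     // written before the decr is now visible to this thread
 */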
/* MT FIXME does this need a HB/HA annotation?  This macro is only used for
 * cancel_send right now. */
/* was_incomplete_ is TRUE iff the cc was nonzero before the incr (i.e., the
 * request was still incomplete); OPA_fetch_and_incr_int returns the old
 * value */
#define MPIU_cc_incr(cc_ptr_, was_incomplete_)                \
    do {                                                      \
        *(was_incomplete_) = OPA_fetch_and_incr_int(cc_ptr_); \
    } while (0)

#define MPIU_cc_get(cc_) OPA_load_int(&(cc_))

/* "publishes" the obj with handle value (handle_) via the handle pointer
 * (hnd_lval_).  That is, it is a version of the following statement that
 * fixes memory consistency issues:
 *     (hnd_lval_) = (handle_);
 *
 * assumes that the following is always true: typeof(hnd_lval_)==int */
/* This could potentially be generalized beyond MPI-handle objects, but we
 * should only take that step after seeing good evidence of its use.  A
 * general macro (that is portable to non-gcc compilers) will need type
 * information to make the appropriate volatile cast. */
/* Ideally _GLOBAL would use this too, but we don't want to count on OPA
 * availability in _GLOBAL mode.  Instead the GLOBAL critical section should
 * be used. */
#define MPIU_OBJ_PUBLISH_HANDLE(hnd_lval_, handle_)                            \
    do {                                                                       \
        if (MPIR_ThreadInfo.isThreaded) {                                      \
            /* wmb ensures all read-only object field values are seen before */ \
            /* the handle value is seen at the application level */            \
            OPA_write_barrier();                                               \
            /* volatile ensures lval is not speculatively read or written */   \
            *(volatile int *) &(hnd_lval_) = (handle_);                        \
        }                                                                      \
        else {                                                                 \
            (hnd_lval_) = (handle_);                                           \
        }                                                                      \
    } while (0)

#endif /* !defined(MPIU_THREAD_POBJ_H_INCLUDED) */
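/* A minimal sketch (not actual MPICH code) of the intended use of
 * MPIU_OBJ_PUBLISH_HANDLE: fully initialize a freshly allocated object, then
 * make its handle visible to other threads; the object and field names are
 * illustrative assumptions:
 *
 *     req->kind = MPID_REQUEST_RECV;       // initialize every field first
 *     MPIU_cc_set(&req->cc, 1);            // including the completion cc
 *     MPIU_OBJ_PUBLISH_HANDLE(*handle_out, req->handle);
 *     // the write barrier in the macro ensures no thread can observe the
 *     // handle value before the initialized fields become visible
 */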