From ab70b7de7dd2dfbdfea893fbce52583d7094891d Mon Sep 17 00:00:00 2001 From: "Vinogradov, Sergei" Date: Fri, 28 Jul 2023 13:30:09 -0400 Subject: [PATCH] Transparent Item movements The new algorithm relies on the moving bit and does not require external synchronization. Data movement happens transparently for the client: if the client thread attempts to get a handle for the item being moved it will get a handle with wait context to wait till the movement is completed. --- cachelib/allocator/CacheAllocator-inl.h | 709 ++++++++++-------- cachelib/allocator/CacheAllocator.h | 155 +++- cachelib/allocator/CacheItem.h | 2 +- cachelib/allocator/MM2Q-inl.h | 6 + cachelib/allocator/MM2Q.h | 4 + cachelib/allocator/MMLru-inl.h | 6 + cachelib/allocator/MMLru.h | 4 + cachelib/allocator/MMTinyLFU-inl.h | 7 + cachelib/allocator/MMTinyLFU.h | 4 + cachelib/allocator/Refcount.h | 29 +- .../allocator/tests/AllocatorTypeTest.cpp | 4 +- cachelib/allocator/tests/BaseAllocatorTest.h | 38 +- cachelib/allocator/tests/RefCountTest.cpp | 10 +- cachelib/common/Mutex.h | 6 + 14 files changed, 627 insertions(+), 357 deletions(-) diff --git a/cachelib/allocator/CacheAllocator-inl.h b/cachelib/allocator/CacheAllocator-inl.h index 680fb104c7..608aa212fc 100644 --- a/cachelib/allocator/CacheAllocator-inl.h +++ b/cachelib/allocator/CacheAllocator-inl.h @@ -81,6 +81,8 @@ CacheAllocator::CacheAllocator( config.chainedItemAccessConfig)), chainedItemLocks_(config_.chainedItemsLockPower, std::make_shared()), + movesMap_(kShards), + moveLock_(kShards), cacheCreationTime_{ type != InitMemType::kMemAttach ? util::getCurrentTimeSec() @@ -415,7 +417,7 @@ CacheAllocator::allocateChainedItem(const ReadHandle& parent, "Cannot call allocate chained item with a empty parent handle!"); } - auto it = allocateChainedItemInternal(parent, size); + auto it = allocateChainedItemInternal(*parent, size); if (auto eventTracker = getEventTracker()) { const auto result = it ? AllocatorApiResult::ALLOCATED : AllocatorApiResult::FAILED; @@ -427,8 +429,8 @@ CacheAllocator::allocateChainedItem(const ReadHandle& parent, template typename CacheAllocator::WriteHandle -CacheAllocator::allocateChainedItemInternal( - const ReadHandle& parent, uint32_t size) { +CacheAllocator::allocateChainedItemInternal(const Item& parent, + uint32_t size) { util::LatencyTracker tracker{stats().allocateLatency_}; SCOPE_FAIL { stats_.invalidAllocs.inc(); }; @@ -436,7 +438,7 @@ CacheAllocator::allocateChainedItemInternal( // number of bytes required for this item const auto requiredSize = ChainedItem::getRequiredSize(size); - const auto pid = allocator_->getAllocInfo(parent->getMemory()).poolId; + const auto pid = allocator_->getAllocInfo(parent.getMemory()).poolId; const auto cid = allocator_->getAllocationClassId(pid, requiredSize); (*stats_.allocAttempts)[pid][cid].inc(); @@ -452,9 +454,8 @@ CacheAllocator::allocateChainedItemInternal( SCOPE_FAIL { allocator_->free(memory); }; - auto child = acquire( - new (memory) ChainedItem(compressor_.compress(parent.getInternal()), size, - util::getCurrentTimeSec())); + auto child = acquire(new (memory) ChainedItem( + compressor_.compress(&parent), size, util::getCurrentTimeSec())); if (child) { child.markNascent(); @@ -494,14 +495,15 @@ void CacheAllocator::addChainedItem(WriteHandle& parent, // Count a new child stats_.numChainedChildItems.inc(); - insertInMMContainer(*child); - // Increment refcount since this chained item is now owned by the parent // Parent will decrement the refcount upon release. 
Since this is an // internal refcount, we dont include it in active handle tracking. - child->incRef(); + auto ret = child->incRef(); + XDCHECK(ret == RefcountWithFlags::incResult::incOk); XDCHECK_EQ(2u, child->getRefCount()); + insertInMMContainer(*child); + invalidateNvm(*parent); if (auto eventTracker = getEventTracker()) { eventTracker->record(AllocatorApiEvent::ADD_CHAINED, parent->getKey(), @@ -565,22 +567,20 @@ CacheAllocator::getParentKey(const Item& chainedItem) { } template -void CacheAllocator::transferChainLocked(WriteHandle& parent, +void CacheAllocator::transferChainLocked(Item& parent, WriteHandle& newParent) { // parent must be in a state to not have concurrent readers. Eviction code - // paths rely on holding the last item handle. Since we hold on to an item - // handle here, the chain will not be touched by any eviction code path. - XDCHECK(parent); + // paths rely on holding the last item handle. XDCHECK(newParent); - XDCHECK_EQ(parent->getKey(), newParent->getKey()); - XDCHECK(parent->hasChainedItem()); + XDCHECK_EQ(parent.getKey(), newParent->getKey()); + XDCHECK(parent.hasChainedItem()); if (newParent->hasChainedItem()) { throw std::invalid_argument(folly::sformat( "New Parent {} has invalid state", newParent->toString())); } - auto headHandle = findChainedItem(*parent); + auto headHandle = findChainedItem(parent); XDCHECK(headHandle); // remove from the access container since we are changing the key @@ -592,6 +592,7 @@ void CacheAllocator::transferChainLocked(WriteHandle& parent, while (curr) { XDCHECK_EQ(curr == headHandle.get() ? 2u : 1u, curr->getRefCount()); XDCHECK(curr->isInMMContainer()); + XDCHECK(!newParent->isMoving()); curr->changeKey(newParentPtr); curr = curr->getNext(compressor_); } @@ -603,7 +604,7 @@ void CacheAllocator::transferChainLocked(WriteHandle& parent, folly::sformat("Did not expect to find an existing chain for {}", newParent->toString(), oldHead->toString())); } - parent->unmarkHasChainedItem(); + parent.unmarkHasChainedItem(); } template @@ -614,7 +615,7 @@ void CacheAllocator::transferChainAndReplace( } { // scope for chained item lock auto l = chainedItemLocks_.lockExclusive(parent->getKey()); - transferChainLocked(parent, newParent); + transferChainLocked(*parent, newParent); } if (replaceIfAccessible(*parent, *newParent)) { @@ -681,33 +682,10 @@ CacheAllocator::replaceChainedItem(Item& oldItem, } template -typename CacheAllocator::WriteHandle -CacheAllocator::replaceChainedItemLocked(Item& oldItem, - WriteHandle newItemHdl, - const Item& parent) { - XDCHECK(newItemHdl != nullptr); - XDCHECK_GE(1u, oldItem.getRefCount()); - - // grab the handle to the old item so that we can return this. Also, we need - // to drop the refcount the parent holds on oldItem by manually calling - // decRef. To do that safely we need to have a proper outstanding handle. - auto oldItemHdl = acquire(&oldItem); - - // Replace the old chained item with new item in the MMContainer before we - // actually replace the old item in the chain - - if (!replaceChainedItemInMMContainer(oldItem, *newItemHdl)) { - // This should never happen since we currently hold an valid - // parent handle. 
None of its chained items can be removed - throw std::runtime_error(folly::sformat( - "chained item cannot be replaced in MM container, oldItem={}, " - "newItem={}, parent={}", - oldItem.toString(), newItemHdl->toString(), parent.toString())); - } - - XDCHECK(!oldItem.isInMMContainer()); - XDCHECK(newItemHdl->isInMMContainer()); - +void CacheAllocator::replaceInChainLocked(Item& oldItem, + WriteHandle& newItemHdl, + const Item& parent, + bool fromMove) { auto head = findChainedItem(parent); XDCHECK(head != nullptr); XDCHECK_EQ(reinterpret_cast( @@ -736,16 +714,62 @@ CacheAllocator::replaceChainedItemLocked(Item& oldItem, oldItem.asChainedItem().getNext(compressor_), compressor_); oldItem.asChainedItem().setNext(nullptr, compressor_); - // this should not result in 0 refcount. We are bumping down the internal - // refcount. If it did, we would leak an item. - oldItem.decRef(); - XDCHECK_LT(0u, oldItem.getRefCount()) << oldItem.toString(); + // if called from moveChainedItem then ref will be zero, else + // greater than 0 + if (fromMove) { + // if this is the head chained item, release the handle now + // while refCount > 1 so that the destructor does not + // call releaseBackToAllocator since we want recycle oldItem + if (head) { + head.reset(); + XDCHECK_EQ(1u, oldItem.getRefCount()); + } + oldItem.decRef(); + XDCHECK_EQ(0u, oldItem.getRefCount()) << oldItem.toString(); + } else { + oldItem.decRef(); + XDCHECK_LT(0u, oldItem.getRefCount()) << oldItem.toString(); + } // increment refcount to indicate parent owns this similar to addChainedItem // Since this is an internal refcount, we dont include it in active handle // tracking. - newItemHdl->incRef(); + auto ret = newItemHdl->incRef(); + XDCHECK(ret == RefcountWithFlags::incResult::incOk); +} + +template +typename CacheAllocator::WriteHandle +CacheAllocator::replaceChainedItemLocked(Item& oldItem, + WriteHandle newItemHdl, + const Item& parent) { + XDCHECK(newItemHdl != nullptr); + XDCHECK_GE(1u, oldItem.getRefCount()); + + // grab the handle to the old item so that we can return this. Also, we need + // to drop the refcount the parent holds on oldItem by manually calling + // decRef. To do that safely we need to have a proper outstanding handle. + auto oldItemHdl = acquire(&oldItem); + XDCHECK_GE(2u, oldItem.getRefCount()); + + // Replace the old chained item with new item in the MMContainer before we + // actually replace the old item in the chain + + if (!replaceChainedItemInMMContainer(oldItem, *newItemHdl)) { + // This should never happen since we currently hold an valid + // parent handle. 
None of its chained items can be removed.
+    throw std::runtime_error(folly::sformat(
+        "chained item cannot be replaced in MM container, oldItem={}, "
+        "newItem={}, parent={}",
+        oldItem.toString(), newItemHdl->toString(), parent.toString()));
+  }
+
+  XDCHECK(!oldItem.isInMMContainer());
+  XDCHECK(newItemHdl->isInMMContainer());
+
+  replaceInChainLocked(oldItem, newItemHdl, parent, false);
+
   return oldItemHdl;
 }
 
@@ -910,12 +934,12 @@ CacheAllocator::releaseBackToAllocator(Item& it,
 }
 
 template 
-bool CacheAllocator::incRef(Item& it) {
-  if (it.incRef()) {
+RefcountWithFlags::incResult CacheAllocator::incRef(Item& it) {
+  auto ret = it.incRef();
+  if (ret == RefcountWithFlags::incResult::incOk) {
     ++handleCount_.tlStats();
-    return true;
   }
-  return false;
+  return ret;
 }
 
 template 
@@ -935,11 +959,19 @@
CacheAllocator::acquire(Item* it) {
   SCOPE_FAIL { stats_.numRefcountOverflow.inc(); };
 
-  if (LIKELY(incRef(*it))) {
-    return WriteHandle{it, *this};
-  } else {
-    // item is being evicted
-    return WriteHandle{};
+  while (true) {
+    auto incRes = incRef(*it);
+    if (LIKELY(incRes == RefcountWithFlags::incResult::incOk)) {
+      return WriteHandle{it, *this};
+    } else if (incRes == RefcountWithFlags::incResult::incFailedEviction) {
+      // item is being evicted
+      return WriteHandle{};
+    } else {
+      // item is being moved - wait for completion
+      WriteHandle handle;
+      if (tryGetHandleWithWaitContextForMovingItem(*it, handle))
+        return handle;
+    }
   }
 }
 
@@ -1127,19 +1159,71 @@ CacheAllocator::insertOrReplace(const WriteHandle& handle) {
   return replaced;
 }
 
+/* The next two methods are used to asynchronously move an Item between slabs.
+ *
+ * The thread that moves the Item allocates a new Item in the tier we are
+ * moving to and calls moveRegularItem(), which does the following:
+ *  1. Updates the access container with the new item from the tier we are
+ *     moving to. This Item has the moving flag set.
+ *  2. Copies data from the old Item to the new one.
+ *
+ * Concurrent threads that request a handle for the same key:
+ *  1. When a handle is created, it checks whether the moving flag is set.
+ *  2. If so, the Handle implementation creates a waitContext and adds it to
+ *     the MoveCtx by calling tryGetHandleWithWaitContextForMovingItem().
+ *  3. The waitContext blocks the caller until the moving thread completes
+ *     its job.
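+ *
+ * Editorial sketch (not part of the patch): the reader side of steps 1-3, as
+ * it appears in acquire() above - if incRef() reports that the item is
+ * moving, the reader parks on the key's MoveCtx instead of spinning:
+ *
+ *   auto res = incRef(*it);
+ *   if (res == RefcountWithFlags::incResult::incFailedMoving) {
+ *     WriteHandle handle;
+ *     // registers a waitContext with this key's MoveCtx under the shard's
+ *     // move lock; the handle becomes ready once the mover calls
+ *     // wakeUpWaiters() with the handle to the new copy
+ *     if (tryGetHandleWithWaitContextForMovingItem(*it, handle)) {
+ *       return handle;
+ *     }
+ *     // the moving bit was already cleared - fall through and retry incRef()
+ *   }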
+ */ +template +bool CacheAllocator::tryGetHandleWithWaitContextForMovingItem( + Item& item, WriteHandle& handle) { + auto shard = getShardForKey(item.getKey()); + auto& movesMap = getMoveMapForShard(shard); + { + auto lock = getMoveLockForShard(shard); + + // item might have been evicted or moved before the lock was acquired + if (!item.isMoving()) + return false; + + WriteHandle hdl{*this}; + auto waitContext = hdl.getItemWaitContext(); + + auto ret = movesMap.try_emplace(item.getKey(), std::make_unique()); + ret.first->second->addWaiter(std::move(waitContext)); + + handle = std::move(hdl); + return true; + } +} + +template +size_t CacheAllocator::wakeUpWaitersLocked(folly::StringPiece key, + WriteHandle&& handle) { + std::unique_ptr ctx; + auto shard = getShardForKey(key); + auto& movesMap = getMoveMapForShard(shard); + { + auto lock = getMoveLockForShard(shard); + movesMap.eraseInto( + key, [&](auto&& key, auto&& value) { ctx = std::move(value); }); + } + + if (ctx) { + ctx->setItemHandle(std::move(handle)); + return ctx->numWaiters(); + } + + return 0; +} + template bool CacheAllocator::moveRegularItem(Item& oldItem, WriteHandle& newItemHdl) { - XDCHECK(config_.moveCb); + XDCHECK(oldItem.isMoving()); + XDCHECK(!oldItem.isExpired()); util::LatencyTracker tracker{stats_.moveRegularLatency_}; - if (!oldItem.isAccessible() || oldItem.isExpired()) { - return false; - } - XDCHECK_EQ(newItemHdl->getSize(), oldItem.getSize()); - XDCHECK_EQ(reinterpret_cast(&getMMContainer(oldItem)), - reinterpret_cast(&getMMContainer(*newItemHdl))); // take care of the flags before we expose the item to be accessed. this // will ensure that when another thread removes the item from RAM, we issue @@ -1148,52 +1232,36 @@ bool CacheAllocator::moveRegularItem(Item& oldItem, newItemHdl->markNvmClean(); } - // Execute the move callback. We cannot make any guarantees about the - // consistency of the old item beyond this point, because the callback can - // do more than a simple memcpy() e.g. update external references. If there - // are any remaining handles to the old item, it is the caller's - // responsibility to invalidate them. The move can only fail after this - // statement if the old item has been removed or replaced, in which case it - // should be fine for it to be left in an inconsistent state. - config_.moveCb(oldItem, *newItemHdl, nullptr); - - // Inside the access container's lock, this checks if the old item is - // accessible and its refcount is zero. If the item is not accessible, - // there is no point to replace it since it had already been removed - // or in the process of being removed. If the item is in cache but the - // refcount is non-zero, it means user could be attempting to remove - // this item through an API such as remove(itemHandle). In this case, - // it is unsafe to replace the old item with a new one, so we should - // also abort. - if (!accessContainer_->replaceIf(oldItem, *newItemHdl, - itemExclusivePredicate)) { - return false; - } - - // Inside the MM container's lock, this checks if the old item exists to - // make sure that no other thread removed it, and only then replaces it. - if (!replaceInMMContainer(oldItem, *newItemHdl)) { - accessContainer_->remove(*newItemHdl); - return false; + if (config_.moveCb) { + // Execute the move callback. We cannot make any guarantees about the + // consistency of the old item beyond this point, because the callback can + // do more than a simple memcpy() e.g. update external references. 
If there + // are any remaining handles to the old item, it is the caller's + // responsibility to invalidate them. The move can only fail after this + // statement if the old item has been removed or replaced, in which case it + // should be fine for it to be left in an inconsistent state. + config_.moveCb(oldItem, *newItemHdl, nullptr); + } else { + std::memcpy(newItemHdl->getMemory(), oldItem.getMemory(), + oldItem.getSize()); } - // Replacing into the MM container was successful, but someone could have - // called insertOrReplace() or remove() before or after the - // replaceInMMContainer() operation, which would invalidate newItemHdl. - if (!newItemHdl->isAccessible()) { - removeFromMMContainer(*newItemHdl); - return false; - } + // Adding the item to mmContainer has to succeed since no one can remove the + // item + auto& newContainer = getMMContainer(*newItemHdl); + auto mmContainerAdded = newContainer.add(*newItemHdl); + XDCHECK(mmContainerAdded); - // no one can add or remove chained items at this point if (oldItem.hasChainedItem()) { - // safe to acquire handle for a moving Item - auto oldHandle = acquire(&oldItem); - XDCHECK_EQ(1u, oldHandle->getRefCount()) << oldHandle->toString(); XDCHECK(!newItemHdl->hasChainedItem()) << newItemHdl->toString(); try { - auto l = chainedItemLocks_.lockExclusive(oldItem.getKey()); - transferChainLocked(oldHandle, newItemHdl); + auto l = chainedItemLocks_.tryLockExclusive(oldItem.getKey()); + if (l) { + transferChainLocked(oldItem, newItemHdl); + } else { + newContainer.remove(*newItemHdl); + return false; + } } catch (const std::exception& e) { // this should never happen because we drained all the handles. XLOGF(DFATAL, "{}", e.what()); @@ -1203,6 +1271,18 @@ bool CacheAllocator::moveRegularItem(Item& oldItem, XDCHECK(!oldItem.hasChainedItem()); XDCHECK(newItemHdl->hasChainedItem()); } + + auto predicate = [&](const Item& item) { + // we rely on moving flag being set (it should block all readers) + XDCHECK_EQ(item.getRefCount(), 0); + XDCHECK(item.isMoving()); + return true; + }; + if (!accessContainer_->replaceIf(oldItem, *newItemHdl, predicate)) { + newContainer.remove(*newItemHdl); + return false; + } + newItemHdl.unmarkNascent(); return true; } @@ -1210,64 +1290,71 @@ bool CacheAllocator::moveRegularItem(Item& oldItem, template bool CacheAllocator::moveChainedItem(ChainedItem& oldItem, WriteHandle& newItemHdl) { - XDCHECK(config_.moveCb); + Item& parentItem = oldItem.getParentItem(compressor_); + XDCHECK(parentItem.isMoving()); util::LatencyTracker tracker{stats_.moveChainedLatency_}; - // This item has been unlinked from its parent and we're the only - // owner of it, so we're done here - if (!oldItem.isInMMContainer() || oldItem.isOnlyMoving()) { - return false; - } - - const auto parentKey = oldItem.getParentItem(compressor_).getKey(); - - // Grab lock to prevent anyone else from modifying the chain - auto l = chainedItemLocks_.lockExclusive(parentKey); - - auto parentHandle = - validateAndGetParentHandleForChainedMoveLocked(oldItem, parentKey); - if (!parentHandle) { + auto& expectedParent = oldItem.getParentItem(compressor_); + const auto parentKey = expectedParent.getKey(); + auto l = chainedItemLocks_.tryLockExclusive(parentKey); + if (!l) { return false; } + XDCHECK_EQ(&expectedParent, &parentItem); - // once we have the moving sync and valid parent for the old item, check if + // check if // the original allocation was made correctly. If not, we destroy the // allocation to indicate a retry to moving logic above. 
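+
+  // Editorial aside (not part of the patch): both moveRegularItem() above
+  // and moveChainedItem() here fall back to a plain memcpy when no moveCb
+  // is configured, so item movement no longer requires a user callback. A
+  // callback remains useful when payloads hold external references, e.g.
+  // (hypothetical user code, assuming the existing
+  // enableMovingOnSlabRelease() config API):
+  //
+  //   config.enableMovingOnSlabRelease(
+  //       [](Item& oldItem, Item& newItem, Item* /* parentPtr */) {
+  //         std::memcpy(newItem.getMemory(), oldItem.getMemory(),
+  //                     oldItem.getSize());
+  //         // ...fix up any external references to oldItem here...
+  //       });
+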
if (reinterpret_cast( &newItemHdl->asChainedItem().getParentItem(compressor_)) != - reinterpret_cast(&parentHandle->asChainedItem())) { - newItemHdl.reset(); + reinterpret_cast(&parentItem.asChainedItem())) { + XDCHECK(false); return false; } XDCHECK_EQ(reinterpret_cast( &newItemHdl->asChainedItem().getParentItem(compressor_)), - reinterpret_cast(&parentHandle->asChainedItem())); + reinterpret_cast(&parentItem.asChainedItem())); - // In case someone else had removed this chained item from its parent by now - // So we check again to see if the it has been unlinked from its parent - if (!oldItem.isInMMContainer() || oldItem.isOnlyMoving()) { - return false; - } - - auto parentPtr = parentHandle.getInternal(); + auto parentPtr = &parentItem; XDCHECK_EQ(reinterpret_cast(parentPtr), reinterpret_cast(&oldItem.getParentItem(compressor_))); - // Invoke the move callback to fix up any user data related to the chain - config_.moveCb(oldItem, *newItemHdl, parentPtr); + if (config_.moveCb) { + // Execute the move callback. We cannot make any guarantees about the + // consistency of the old item beyond this point, because the callback can + // do more than a simple memcpy() e.g. update external references. If there + // are any remaining handles to the old item, it is the caller's + // responsibility to invalidate them. The move can only fail after this + // statement if the old item has been removed or replaced, in which case it + // should be fine for it to be left in an inconsistent state. + config_.moveCb(oldItem, *newItemHdl, parentPtr); + } else { + std::memcpy(newItemHdl->getMemory(), oldItem.getMemory(), + oldItem.getSize()); + } // Replace the new item in the position of the old one before both in the // parent's chain and the MMContainer. - auto oldItemHandle = - replaceChainedItemLocked(oldItem, std::move(newItemHdl), *parentHandle); - XDCHECK(oldItemHandle->isMoving()); - XDCHECK(!oldItemHandle->isInMMContainer()); + XDCHECK_EQ(parentItem.getRefCount(), 0); + auto& newContainer = getMMContainer(*newItemHdl); + auto mmContainerAdded = newContainer.add(*newItemHdl); + XDCHECK(mmContainerAdded); + + replaceInChainLocked(oldItem, newItemHdl, parentItem, true); return true; } +template +typename CacheAllocator::NvmCacheT::PutToken +CacheAllocator::createPutToken(Item& item) { + const bool evictToNvmCache = shouldWriteToNvmCache(item); + return evictToNvmCache ? nvmCache_->createPutToken(item.getKey()) + : typename NvmCacheT::PutToken{}; +} + template void CacheAllocator::unlinkItemForEviction(Item& it) { XDCHECK(it.isMarkedForEviction()); @@ -2478,6 +2565,8 @@ void CacheAllocator::releaseSlabImpl( // 3. If 2 is successful, Move or Evict // 4. 
Move on to the next item if current item is freed for (auto alloc : releaseContext.getActiveAllocations()) { + Item& item = *static_cast(alloc); + // Need to mark an item for release before proceeding // If we can't mark as moving, it means the item is already freed const bool isAlreadyFreed = @@ -2486,8 +2575,6 @@ void CacheAllocator::releaseSlabImpl( continue; } - Item& item = *static_cast(alloc); - // Try to move this item and make sure we can free the memory const bool isMoved = moveForSlabRelease(releaseContext, item, throttler); @@ -2509,6 +2596,15 @@ void CacheAllocator::throttleWith(util::Throttler& t, } } +template +typename RefcountWithFlags::Value +CacheAllocator::unmarkMovingAndWakeUpWaiters(Item& item, + WriteHandle handle) { + auto ret = item.unmarkMoving(); + wakeUpWaiters(item, std::move(handle)); + return ret; +} + template bool CacheAllocator::moveForSlabRelease( const SlabReleaseContext& ctx, Item& oldItem, util::Throttler& throttler) { @@ -2518,42 +2614,36 @@ bool CacheAllocator::moveForSlabRelease( bool isMoved = false; auto startTime = util::getCurrentTimeSec(); - WriteHandle newItemHdl = allocateNewItemForOldItem(oldItem); - - for (unsigned int itemMovingAttempts = 0; - itemMovingAttempts < config_.movingTries; - ++itemMovingAttempts) { - stats_.numMoveAttempts.inc(); + Item* parentItem; + bool chainedItem = oldItem.isChainedItem(); - // Nothing to move and the key is likely also bogus for chained items. - if (oldItem.isOnlyMoving()) { - oldItem.unmarkMoving(); - const auto res = - releaseBackToAllocator(oldItem, RemoveContext::kNormal, false); - XDCHECK(res == ReleaseRes::kReleased); - return true; - } + stats_.numMoveAttempts.inc(); - if (!newItemHdl) { - // try to allocate again if it previously wasn't successful - newItemHdl = allocateNewItemForOldItem(oldItem); - } + // Nothing to move - in the case that tryMoving failed + // for chained items we would have already evicted the entire chain. + if (oldItem.isOnlyMoving()) { + XDCHECK(!chainedItem); + auto ret = unmarkMovingAndWakeUpWaiters(oldItem, {}); + XDCHECK(ret == 0); + const auto res = + releaseBackToAllocator(oldItem, RemoveContext::kNormal, false); + XDCHECK(res == ReleaseRes::kReleased); + return true; + } - // if we have a valid handle, try to move, if not, we retry. - if (newItemHdl) { - isMoved = tryMovingForSlabRelease(oldItem, newItemHdl); - if (isMoved) { - break; - } - } + if (chainedItem) { + parentItem = &oldItem.asChainedItem().getParentItem(compressor_); + XDCHECK(parentItem->isMoving()); + XDCHECK_EQ(1, oldItem.getRefCount()); + XDCHECK_EQ(0, parentItem->getRefCount()); + } else { + XDCHECK(oldItem.isMoving()); + } + WriteHandle newItemHdl = allocateNewItemForOldItem(oldItem); - throttleWith(throttler, [&] { - XLOGF(WARN, - "Spent {} seconds, slab release still trying to move Item: {}. " - "Pool: {}, Class: {}.", - util::getCurrentTimeSec() - startTime, oldItem.toString(), - ctx.getPoolId(), ctx.getClassId()); - }); + // if we have a valid handle, try to move, if not, we retry. + if (newItemHdl) { + isMoved = tryMovingForSlabRelease(oldItem, newItemHdl); } // Return false if we've exhausted moving tries. @@ -2561,21 +2651,27 @@ bool CacheAllocator::moveForSlabRelease( return false; } - // Since item has been moved, we can directly free it. We don't need to - // worry about any stats related changes, because there is another item - // that's identical to this one to replace it. Here we just need to wait - // until all users have dropped the item handles before we can proceed. 
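
Editorial sketch (not part of the patch): the rewritten moveForSlabRelease()
above now makes a single move attempt per successful mark - the retry loop
lives in markMovingForSlabRelease(). For a regular (non-chained) item the
happy path reduces to:

    // the victim was already marked moving by markMovingForSlabRelease()
    WriteHandle newItemHdl = allocateNewItemForOldItem(oldItem);
    if (newItemHdl && tryMovingForSlabRelease(oldItem, newItemHdl)) {
      // clears the moving bit and hands the new handle to any readers
      // parked on this key's MoveCtx
      unmarkMovingAndWakeUpWaiters(oldItem, std::move(newItemHdl));
      allocator_->free(&oldItem);
    }
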
- startTime = util::getCurrentTimeSec(); - while (!oldItem.isOnlyMoving()) { - throttleWith(throttler, [&] { - XLOGF(WARN, - "Spent {} seconds, slab release still waiting for refcount to " - "drain Item: {}. Pool: {}, Class: {}.", - util::getCurrentTimeSec() - startTime, oldItem.toString(), - ctx.getPoolId(), ctx.getClassId()); - }); - } const auto allocInfo = allocator_->getAllocInfo(oldItem.getMemory()); + if (chainedItem) { + newItemHdl.reset(); + auto ref = parentItem->unmarkMoving(); + if (UNLIKELY(ref == 0)) { + wakeUpWaiters(*parentItem, {}); + const auto res = + releaseBackToAllocator(*parentItem, RemoveContext::kNormal, false); + XDCHECK(res == ReleaseRes::kReleased); + return true; + } else { + XDCHECK_NE(ref, 0); + auto parentHdl = acquire(parentItem); + if (parentHdl) { + wakeUpWaiters(*parentItem, std::move(parentHdl)); + } + } + } else { + auto ref = unmarkMovingAndWakeUpWaiters(oldItem, std::move(newItemHdl)); + XDCHECK(ref == 0); + } allocator_->free(&oldItem); (*stats_.fragmentationSize)[allocInfo.poolId][allocInfo.classId].sub( @@ -2584,53 +2680,21 @@ bool CacheAllocator::moveForSlabRelease( return true; } -template -typename CacheAllocator::ReadHandle -CacheAllocator::validateAndGetParentHandleForChainedMoveLocked( - const ChainedItem& item, const Key& parentKey) { - ReadHandle parentHandle{}; - try { - parentHandle = findInternal(parentKey); - // If the parent is not the same as the parent of the chained item, - // it means someone has replaced our old parent already. So we abort. - if (!parentHandle || - parentHandle.get() != &item.getParentItem(compressor_)) { - return {}; - } - } catch (const exception::RefcountOverflow&) { - return {}; - } - - return parentHandle; -} - template typename CacheAllocator::WriteHandle CacheAllocator::allocateNewItemForOldItem(const Item& oldItem) { if (oldItem.isChainedItem()) { - const auto& oldChainedItem = oldItem.asChainedItem(); - const auto parentKey = oldChainedItem.getParentItem(compressor_).getKey(); + const Item& parentItem = oldItem.asChainedItem().getParentItem(compressor_); - // Grab lock to prevent anyone else from modifying the chain - auto l = chainedItemLocks_.lockExclusive(parentKey); - - auto parentHandle = validateAndGetParentHandleForChainedMoveLocked( - oldChainedItem, parentKey); - if (!parentHandle) { - return {}; - } - - // Set up the destination for the move. Since oldChainedItem would be - // marked as moving, it won't be picked for eviction. auto newItemHdl = - allocateChainedItemInternal(parentHandle, oldChainedItem.getSize()); + allocateChainedItemInternal(parentItem, oldItem.getSize()); if (!newItemHdl) { return {}; } + const auto& oldChainedItem = oldItem.asChainedItem(); XDCHECK_EQ(newItemHdl->getSize(), oldChainedItem.getSize()); - auto parentPtr = parentHandle.getInternal(); - XDCHECK_EQ(reinterpret_cast(parentPtr), + XDCHECK_EQ(reinterpret_cast(&parentItem), reinterpret_cast( &oldChainedItem.getParentItem(compressor_))); @@ -2662,38 +2726,20 @@ CacheAllocator::allocateNewItemForOldItem(const Item& oldItem) { template bool CacheAllocator::tryMovingForSlabRelease( Item& oldItem, WriteHandle& newItemHdl) { - // By holding onto a user-level synchronization object, we ensure moving - // a regular item or chained item is synchronized with any potential - // user-side mutation. 
- std::unique_ptr syncObj; - if (config_.movingSync) { - if (!oldItem.isChainedItem()) { - syncObj = config_.movingSync(oldItem.getKey()); - } else { - // Copy the key so we have a valid key to work with if the chained - // item is still valid. - const std::string parentKey = - oldItem.asChainedItem().getParentItem(compressor_).getKey().str(); - if (oldItem.isOnlyMoving()) { - // If chained item no longer has a refcount, its parent is already - // being released, so we abort this try to moving. - return false; - } - syncObj = config_.movingSync(parentKey); - } - - // We need to differentiate between the following three scenarios: - // 1. nullptr indicates no move sync required for this particular item - // 2. moveSync.isValid() == true meaning we've obtained the sync - // 3. moveSync.isValid() == false meaning we need to abort and retry - if (syncObj && !syncObj->isValid()) { - return false; - } - } + // move can fail if another thread calls insertOrReplace + // in this case oldItem is no longer valid (not accessible, + // it gets removed from MMContainer and evictForSlabRelease + // will send it back to the allocator + bool ret = oldItem.isChainedItem() + ? moveChainedItem(oldItem.asChainedItem(), newItemHdl) + : moveRegularItem(oldItem, newItemHdl); + removeFromMMContainer(oldItem); + return ret; +} - return oldItem.isChainedItem() - ? moveChainedItem(oldItem.asChainedItem(), newItemHdl) - : moveRegularItem(oldItem, newItemHdl); +template +void CacheAllocator::wakeUpWaiters(Item& item, WriteHandle handle) { + wakeUpWaitersLocked(item.getKey(), std::move(handle)); } template @@ -2701,62 +2747,21 @@ void CacheAllocator::evictForSlabRelease( const SlabReleaseContext& ctx, Item& item, util::Throttler& throttler) { auto startTime = util::getCurrentTimeSec(); while (true) { + // we can't rely on an item being marked moving because + // it may have previously been a chained item stats_.numEvictionAttempts.inc(); - // if the item is already in a state where only the exclusive bit is set, - // nothing needs to be done. We simply need to call unmarkMoving and free - // the item. - if (item.isOnlyMoving()) { - item.unmarkMoving(); - const auto res = - releaseBackToAllocator(item, RemoveContext::kNormal, false); - XDCHECK(ReleaseRes::kReleased == res); - return; - } - - // Since we couldn't move, we now evict this item. Owning handle will be - // the item's handle for regular/normal items and will be the parent - // handle for chained items. - auto owningHandle = - item.isChainedItem() - ? evictChainedItemForSlabRelease(item.asChainedItem()) - : evictNormalItemForSlabRelease(item); - - // we managed to evict the corresponding owner of the item and have the - // last handle for the owner. - if (owningHandle) { - const auto allocInfo = - allocator_->getAllocInfo(static_cast(&item)); - if (owningHandle->hasChainedItem()) { - (*stats_.chainedItemEvictions)[allocInfo.poolId][allocInfo.classId] - .inc(); - } else { - (*stats_.regularItemEvictions)[allocInfo.poolId][allocInfo.classId] - .inc(); - } - - stats_.numEvictionSuccesses.inc(); - - // we have the last handle. 
no longer need to hold on to the exclusive bit - item.unmarkMoving(); - - // manually decrement the refcount to call releaseBackToAllocator - const auto ref = decRef(*owningHandle); - XDCHECK(ref == 0); - const auto res = releaseBackToAllocator(*owningHandle.release(), - RemoveContext::kEviction, false); - XDCHECK(res == ReleaseRes::kReleased); - return; - } - if (shutDownInProgress_) { - item.unmarkMoving(); + if (item.isMoving()) { + auto ref = unmarkMovingAndWakeUpWaiters(item, {}); + } allocator_->abortSlabRelease(ctx); throw exception::SlabReleaseAborted( folly::sformat("Slab Release aborted while trying to evict" " Item: {} Pool: {}, Class: {}.", item.toString(), ctx.getPoolId(), ctx.getClassId())); } + throttleWith(throttler, [&] { XLOGF(WARN, "Spent {} seconds, slab release still trying to evict Item: {}. " @@ -2770,6 +2775,64 @@ void CacheAllocator::evictForSlabRelease( .toString()) : ""); }); + + // if the item is already in a state where only the exclusive bit is set, + // nothing needs to be done. We simply need to call unmarkMoving and free + // the item. + if (item.isOnlyMoving()) { + auto ref = unmarkMovingAndWakeUpWaiters(item, {}); + XDCHECK(ref == 0); + const auto res = + releaseBackToAllocator(item, RemoveContext::kNormal, false); + XDCHECK(ReleaseRes::kReleased == res); + return; + } + + typename NvmCacheT::PutToken token; + bool isChainedItem = item.isChainedItem(); + Item* evicted = isChainedItem + ? &item.asChainedItem().getParentItem(compressor_) + : &item; + + XDCHECK(evicted->isMoving()); + token = createPutToken(*evicted); + auto ret = evicted->markForEvictionWhenMoving(); + XDCHECK(ret); + // unmark the child so it will be freed + // TODO entire chain just gets evicted since moveForSlabRelease + // returns false + XDCHECK(!item.isMoving()); + unlinkItemForEviction(*evicted); + // wake up any readers that wait for the move to complete + // it's safe to do now, as we have the item marked exclusive and + // no other reader can be added to the waiters list + wakeUpWaiters(*evicted, {}); + + if (token.isValid() && shouldWriteToNvmCacheExclusive(*evicted)) { + nvmCache_->put(*evicted, std::move(token)); + } + + const auto allocInfo = + allocator_->getAllocInfo(static_cast(&item)); + if (evicted->hasChainedItem()) { + (*stats_.chainedItemEvictions)[allocInfo.poolId][allocInfo.classId].inc(); + } else { + (*stats_.regularItemEvictions)[allocInfo.poolId][allocInfo.classId].inc(); + } + + stats_.numEvictionSuccesses.inc(); + + XDCHECK(evicted->getRefCount() == 0); + const auto res = + releaseBackToAllocator(*evicted, RemoveContext::kEviction, false); + XDCHECK(res == ReleaseRes::kReleased); + + const bool isAlreadyFreed = + !markMovingForSlabRelease(ctx, &item, throttler); + if (!isAlreadyFreed) { + continue; + } + return; } } @@ -2945,14 +3008,47 @@ bool CacheAllocator::markMovingForSlabRelease( // At first, we assume this item was already freed bool itemFreed = true; + Item* syncItem = nullptr; bool markedMoving = false; - const auto fn = [&markedMoving, &itemFreed](void* memory) { + const auto fn = [this, &syncItem, &markedMoving, &itemFreed](void* memory) { // Since this callback is executed, the item is not yet freed itemFreed = false; Item* item = static_cast(memory); - if (item->markMoving()) { - markedMoving = true; - } + auto& mmContainer = getMMContainer(*item); + mmContainer.withContainerLock([this, &mmContainer, &syncItem, &item, + &markedMoving]() { + // we rely on the mmContainer lock to safely check that the item is + // currently in the mmContainer (no 
other threads are currently allocating + // this item). This is needed to sync on the case where a chained item + // is being released back to allocator and it's parent ref could be + // invalid. We need a valid parent ref in order to mark a chained item + // as moving since we sync on the parent by marking it as moving. + if (!item->isInMMContainer()) { + return; + } + bool chainedItem_ = item->isChainedItem(); + XDCHECK_EQ(&getMMContainer(*item), &mmContainer); + Item* syncItem_ = chainedItem_ + ? &item->asChainedItem().getParentItem(compressor_) + : item; + // in order to safely check if the expected parent (syncItem_) matches + // the current parent on the chained item, we need to take the chained + // item lock so we are sure that nobody else will be editing the chain + auto l_ = chainedItem_ + ? chainedItemLocks_.tryLockExclusive(syncItem_->getKey()) + : decltype(chainedItemLocks_.tryLockExclusive( + syncItem_->getKey()))(); + + if (chainedItem_ && (!l_ || &item->asChainedItem().getParentItem( + compressor_) != syncItem_)) { + markedMoving = false; + return; + } + if (syncItem_->markMoving()) { + markedMoving = true; + syncItem = syncItem_; + } + }); }; auto startTime = util::getCurrentTimeSec(); @@ -2964,6 +3060,8 @@ bool CacheAllocator::markMovingForSlabRelease( if (itemFreed) { return false; } else if (markedMoving) { + Item* item = static_cast(alloc); + XDCHECK(syncItem->isMoving()); return true; } @@ -2979,6 +3077,7 @@ bool CacheAllocator::markMovingForSlabRelease( static_cast(alloc)->toString(), ctx.getPoolId(), ctx.getClassId())); } + stats_.numMoveAttempts.inc(); throttleWith(throttler, [&] { XLOGF(WARN, "Spent {} seconds, slab release still trying to mark as moving for " diff --git a/cachelib/allocator/CacheAllocator.h b/cachelib/allocator/CacheAllocator.h index 6888e2fd41..0a3b8f1d41 100644 --- a/cachelib/allocator/CacheAllocator.h +++ b/cachelib/allocator/CacheAllocator.h @@ -1349,7 +1349,7 @@ class CacheAllocator : public CacheBase { private: // wrapper around Item's refcount and active handle tracking - FOLLY_ALWAYS_INLINE bool incRef(Item& it); + FOLLY_ALWAYS_INLINE RefcountWithFlags::incResult incRef(Item& it); FOLLY_ALWAYS_INLINE RefcountWithFlags::Value decRef(Item& it); // drops the refcount and if needed, frees the allocation back to the memory @@ -1473,26 +1473,13 @@ class CacheAllocator : public CacheBase { // The parent handle parameter here is mainly used to find the // correct pool to allocate memory for this chained item // - // @param parent handle to the cache item + // @param parent the parent item // @param size the size for the chained allocation // // @return handle to the chained allocation // @throw std::invalid_argument if the size requested is invalid or // if the item is invalid - WriteHandle allocateChainedItemInternal(const ReadHandle& parent, - uint32_t size); - - // Given an item and its parentKey, validate that the parentKey - // corresponds to an item that's the parent of the supplied item. - // - // @param item item that we want to get the parent handle for - // @param parentKey key of the item's parent - // - // @return handle to the parent item if the validations pass - // otherwise, an empty Handle is returned. - // - ReadHandle validateAndGetParentHandleForChainedMoveLocked( - const ChainedItem& item, const Key& parentKey); + WriteHandle allocateChainedItemInternal(const Item& parent, uint32_t size); // Given an existing item, allocate a new one for the // existing one to later be moved into. 
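
Editorial sketch (not part of the patch): the synchronization pattern that
markMovingForSlabRelease() above builds on - the moving bit is set under the
MM container lock, after re-checking membership and resolving the item to
sync on (the parent, for chained items). Simplified:

    bool markedMoving = false;
    mmContainer.withContainerLock([&]() {
      if (!item->isInMMContainer()) {
        return;  // concurrently freed; the caller treats this as a retry
      }
      Item* syncItem = item->isChainedItem()
                           ? &item->asChainedItem().getParentItem(compressor_)
                           : item;
      markedMoving = syncItem->markMoving();
    });

The full version additionally takes the chained-item lock so that the
expected parent of a chained item can be validated before marking.
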
@@ -1609,7 +1596,7 @@ class CacheAllocator : public CacheBase { // @param newParent the new parent for the chain // // @throw if any of the conditions for parent or newParent are not met. - void transferChainLocked(WriteHandle& parent, WriteHandle& newParent); + void transferChainLocked(Item& parent, WriteHandle& newParent); // replace a chained item in the existing chain. This needs to be called // with the chained item lock held exclusive @@ -1623,6 +1610,24 @@ class CacheAllocator : public CacheBase { WriteHandle newItemHdl, const Item& parent); + // + // Performs the actual inplace replace - it is called from + // moveChainedItem and replaceChainedItemLocked + // must hold chainedItemLock + // + // @param oldItem the item we are replacing in the chain + // @param newItem the item we are replacing it with + // @param parent the parent for the chain + // @param fromMove used to determine if the replaced was called from + // moveChainedItem - we avoid the handle destructor + // in this case. + // + // @return handle to the oldItem + void replaceInChainLocked(Item& oldItem, + WriteHandle& newItemHdl, + const Item& parent, + bool fromMove); + // Insert an item into MM container. The caller must hold a valid handle for // the item. // @@ -1731,6 +1736,19 @@ class CacheAllocator : public CacheBase { using EvictionIterator = typename MMContainer::LockedIterator; + // Wakes up waiters if there are any + // + // @param item wakes waiters that are waiting on that item + // @param handle handle to pass to the waiters + void wakeUpWaiters(Item& item, WriteHandle handle); + + // Unmarks item as moving and wakes up any waiters waiting on that item + // + // @param item wakes waiters that are waiting on that item + // @param handle handle to pass to the waiters + typename RefcountWithFlags::Value unmarkMovingAndWakeUpWaiters( + Item& item, WriteHandle handle); + // Deserializer CacheAllocatorMetadata and verify the version // // @param deserializer Deserializer object @@ -1844,6 +1862,11 @@ class CacheAllocator : public CacheBase { Item& item, util::Throttler& throttler); + // Helper function to create PutToken + // + // @return valid token if the item should be written to NVM cache. + typename NvmCacheT::PutToken createPutToken(Item& item); + // Helper function to evict a normal item for slab release // // @return last handle for corresponding to item on success. empty handle on @@ -2082,6 +2105,88 @@ class CacheAllocator : public CacheBase { // BEGIN private members + bool tryGetHandleWithWaitContextForMovingItem(Item& item, + WriteHandle& handle); + + size_t wakeUpWaitersLocked(folly::StringPiece key, WriteHandle&& handle); + + class MoveCtx { + public: + MoveCtx() {} + + ~MoveCtx() { + // prevent any further enqueue to waiters + // Note: we don't need to hold locks since no one can enqueue + // after this point. + wakeUpWaiters(); + } + + // record the item handle. Upon destruction we will wake up the waiters + // and pass a clone of the handle to the callBack. By default we pass + // a null handle + void setItemHandle(WriteHandle _it) { it = std::move(_it); } + + // enqueue a waiter into the waiter list + // @param waiter WaitContext + void addWaiter(std::shared_ptr> waiter) { + XDCHECK(waiter); + waiters.push_back(std::move(waiter)); + } + + size_t numWaiters() const { return waiters.size(); } + + private: + // notify all pending waiters that are waiting for the fetch. 
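+    // (Editorial illustration, not part of the patch - typical MoveCtx use:
+    //
+    //    // reader, under the shard's move lock:
+    //    auto ret = movesMap.try_emplace(key, std::make_unique<MoveCtx>());
+    //    ret.first->second->addWaiter(handle.getItemWaitContext());
+    //
+    //    // mover, after publishing the new copy:
+    //    ctx->setItemHandle(std::move(newHandle));
+    //    // ~MoveCtx then fulfills every waiter with a clone of the handle.)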
+ void wakeUpWaiters() { + bool refcountOverflowed = false; + for (auto& w : waiters) { + // If refcount overflowed earlier, then we will return miss to + // all subsequent waiters. + if (refcountOverflowed) { + w->set(WriteHandle{}); + continue; + } + + try { + w->set(it.clone()); + } catch (const exception::RefcountOverflow&) { + // We'll return a miss to the user's pending read, + // so we should enqueue a delete via NvmCache. + // TODO: cache.remove(it); + refcountOverflowed = true; + } + } + } + + WriteHandle it; // will be set when Context is being filled + std::vector>> waiters; // list of + // waiters + }; + using MoveMap = + folly::F14ValueMap, + folly::HeterogeneousAccessHash>; + + static size_t getShardForKey(folly::StringPiece key) { + return folly::Hash()(key) % kShards; + } + + MoveMap& getMoveMapForShard(size_t shard) { + return movesMap_[shard].movesMap_; + } + + MoveMap& getMoveMap(folly::StringPiece key) { + return getMoveMapForShard(getShardForKey(key)); + } + + std::unique_lock getMoveLockForShard(size_t shard) { + return std::unique_lock(moveLock_[shard].moveLock_); + } + + std::unique_lock getMoveLock(folly::StringPiece key) { + return getMoveLockForShard(getShardForKey(key)); + } + // Whether the memory allocator for this cache allocator was created on shared // memory. The hash table, chained item hash table etc is also created on // shared memory except for temporary shared memory mode when they're created @@ -2175,6 +2280,22 @@ class CacheAllocator : public CacheBase { // poolResizer_, poolOptimizer_, memMonitor_, reaper_ mutable std::mutex workersMutex_; + static constexpr size_t kShards = 8192; // TODO: need to define right value + + struct MovesMapShard { + alignas(folly::hardware_destructive_interference_size) MoveMap movesMap_; + }; + + struct MoveLock { + alignas(folly::hardware_destructive_interference_size) std::mutex moveLock_; + }; + + // a map of all pending moves + std::vector movesMap_; + + // a map of move locks for each shard + std::vector moveLock_; + // time when the ram cache was first created const uint32_t cacheCreationTime_{0}; diff --git a/cachelib/allocator/CacheItem.h b/cachelib/allocator/CacheItem.h index afee315cbb..4c32ece794 100644 --- a/cachelib/allocator/CacheItem.h +++ b/cachelib/allocator/CacheItem.h @@ -309,7 +309,7 @@ class CACHELIB_PACKED_ATTR CacheItem { // // @return true on success, failure if item is marked as exclusive // @throw exception::RefcountOverflow on ref count overflow - FOLLY_ALWAYS_INLINE bool incRef() { + FOLLY_ALWAYS_INLINE RefcountWithFlags::incResult incRef() { try { return ref_.incRef(); } catch (exception::RefcountOverflow& e) { diff --git a/cachelib/allocator/MM2Q-inl.h b/cachelib/allocator/MM2Q-inl.h index ba388d40a4..27f6ee3578 100644 --- a/cachelib/allocator/MM2Q-inl.h +++ b/cachelib/allocator/MM2Q-inl.h @@ -258,6 +258,12 @@ void MM2Q::Container::withEvictionIterator(F&& fun) { } } +template T::*HookPtr> +template +void MM2Q::Container::withContainerLock(F&& fun) { + lruMutex_->lock_combine([this, &fun]() { fun(); }); +} + template T::*HookPtr> void MM2Q::Container::removeLocked(T& node, bool doRebalance) noexcept { diff --git a/cachelib/allocator/MM2Q.h b/cachelib/allocator/MM2Q.h index 982eca21f9..69a53963a2 100644 --- a/cachelib/allocator/MM2Q.h +++ b/cachelib/allocator/MM2Q.h @@ -502,6 +502,10 @@ class MM2Q { template void withEvictionIterator(F&& f); + // Execute provided function under container lock. 
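+  // (Editorial usage sketch, not part of the patch:
+  //
+  //    mmContainer.withContainerLock([&]() {
+  //      // e.g. check membership while nobody can mutate the LRU
+  //      inContainer = node.isInMMContainer();
+  //    });
+  //
+  //  the callable runs while the container mutex is held, so it must not
+  //  call back into the container.)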
+ template + void withContainerLock(F&& f); + // get the current config as a copy Config getConfig() const; diff --git a/cachelib/allocator/MMLru-inl.h b/cachelib/allocator/MMLru-inl.h index d35759f212..e0ebeafc46 100644 --- a/cachelib/allocator/MMLru-inl.h +++ b/cachelib/allocator/MMLru-inl.h @@ -229,6 +229,12 @@ void MMLru::Container::withEvictionIterator(F&& fun) { } } +template T::*HookPtr> +template +void MMLru::Container::withContainerLock(F&& fun) { + lruMutex_->lock_combine([this, &fun]() { fun(); }); +} + template T::*HookPtr> void MMLru::Container::ensureNotInsertionPoint(T& node) noexcept { // If we are removing the insertion point node, grow tail before we remove diff --git a/cachelib/allocator/MMLru.h b/cachelib/allocator/MMLru.h index 29c6d02689..42c8a38506 100644 --- a/cachelib/allocator/MMLru.h +++ b/cachelib/allocator/MMLru.h @@ -376,6 +376,10 @@ class MMLru { template void withEvictionIterator(F&& f); + // Execute provided function under container lock. + template + void withContainerLock(F&& f); + // get copy of current config Config getConfig() const; diff --git a/cachelib/allocator/MMTinyLFU-inl.h b/cachelib/allocator/MMTinyLFU-inl.h index 46640b24ca..597ce8de70 100644 --- a/cachelib/allocator/MMTinyLFU-inl.h +++ b/cachelib/allocator/MMTinyLFU-inl.h @@ -227,6 +227,13 @@ void MMTinyLFU::Container::withEvictionIterator(F&& fun) { fun(getEvictionIterator()); } +template T::*HookPtr> +template +void MMTinyLFU::Container::withContainerLock(F&& fun) { + LockHolder l(lruMutex_); + fun(); +} + template T::*HookPtr> void MMTinyLFU::Container::removeLocked(T& node) noexcept { if (isTiny(node)) { diff --git a/cachelib/allocator/MMTinyLFU.h b/cachelib/allocator/MMTinyLFU.h index c8f2699264..81099352d7 100644 --- a/cachelib/allocator/MMTinyLFU.h +++ b/cachelib/allocator/MMTinyLFU.h @@ -497,6 +497,10 @@ class MMTinyLFU { template void withEvictionIterator(F&& f); + // Execute provided function under container lock. + template + void withContainerLock(F&& f); + // for saving the state of the lru // // precondition: serialization must happen without any reader or writer diff --git a/cachelib/allocator/Refcount.h b/cachelib/allocator/Refcount.h index 107e10735e..e139c51951 100644 --- a/cachelib/allocator/Refcount.h +++ b/cachelib/allocator/Refcount.h @@ -130,30 +130,35 @@ class FOLLY_PACK_ATTR RefcountWithFlags { RefcountWithFlags& operator=(const RefcountWithFlags&) = delete; RefcountWithFlags(RefcountWithFlags&&) = delete; RefcountWithFlags& operator=(RefcountWithFlags&&) = delete; - + enum incResult { incOk, incFailedMoving, incFailedEviction }; // Bumps up the reference count only if the new count will be strictly less // than or equal to the maxCount and the item is not exclusive // @return true if refcount is bumped. false otherwise (if item is exclusive) // @throw exception::RefcountOverflow if new count would be greater than // maxCount - FOLLY_ALWAYS_INLINE bool incRef() { - auto predicate = [](const Value curValue) { + FOLLY_ALWAYS_INLINE incResult incRef() { + incResult res = incOk; + auto predicate = [&res](const Value curValue) { Value bitMask = getAdminRef(); const bool exlusiveBitIsSet = curValue & bitMask; if (UNLIKELY((curValue & kAccessRefMask) == (kAccessRefMask))) { throw exception::RefcountOverflow("Refcount maxed out."); + } else if (exlusiveBitIsSet) { + res = (curValue & kAccessRefMask) == 0 ? 
incFailedEviction
+                                               : incFailedMoving;
+        return false;
       }
-
-      // Check if the item is not marked for eviction
-      return !exlusiveBitIsSet || ((curValue & kAccessRefMask) != 0);
+      res = incOk;
+      return true;
     };
 
     auto newValue = [](const Value curValue) {
       return (curValue + static_cast(1));
     };
 
-    return atomicUpdateValue(predicate, newValue);
+    atomicUpdateValue(predicate, newValue);
+    return res;
   }
 
   // Bumps down the reference count
@@ -322,11 +327,19 @@ class FOLLY_PACK_ATTR RefcountWithFlags {
   bool markMoving() {
     Value linkedBitMask = getAdminRef();
     Value exclusiveBitMask = getAdminRef();
+    Value isChainedItemFlag = getFlag();
 
-    auto predicate = [linkedBitMask, exclusiveBitMask](const Value curValue) {
+    auto predicate = [linkedBitMask, exclusiveBitMask,
+                      isChainedItemFlag](const Value curValue) {
       const bool unlinked = !(curValue & linkedBitMask);
       const bool alreadyExclusive = curValue & exclusiveBitMask;
+      const bool isChained = curValue & isChainedItemFlag;
 
+      // a chained item can have a refcount == 1; that just means it is
+      // linked in the chain
+      if ((curValue & kAccessRefMask) > (isChained ? 1 : 0)) {
+        return false;
+      }
       if (unlinked || alreadyExclusive) {
         return false;
       }
diff --git a/cachelib/allocator/tests/AllocatorTypeTest.cpp b/cachelib/allocator/tests/AllocatorTypeTest.cpp
index 1e98af29f2..67ad259305 100644
--- a/cachelib/allocator/tests/AllocatorTypeTest.cpp
+++ b/cachelib/allocator/tests/AllocatorTypeTest.cpp
@@ -288,8 +288,8 @@ TYPED_TEST(BaseAllocatorTest, AddChainedItemMultiThreadWithMovingAndSync) {
   this->testAddChainedItemMultithreadWithMovingAndSync();
 }
 
-TYPED_TEST(BaseAllocatorTest, TransferChainWhileMoving) {
-  this->testTransferChainWhileMoving();
+TYPED_TEST(BaseAllocatorTest, TransferChainAfterMoving) {
+  this->testTransferChainAfterMoving();
 }
 
 TYPED_TEST(BaseAllocatorTest, AddAndPopChainedItemMultithread) {
diff --git a/cachelib/allocator/tests/BaseAllocatorTest.h b/cachelib/allocator/tests/BaseAllocatorTest.h
index f503d59f61..db6bf1256c 100644
--- a/cachelib/allocator/tests/BaseAllocatorTest.h
+++ b/cachelib/allocator/tests/BaseAllocatorTest.h
@@ -3655,6 +3655,16 @@ class BaseAllocatorTest : public AllocatorTest {
         sourceAlloc);
     otherThread.join();
 
+    // in the new version with marking the item as moving, move attempts
+    // will only fail if there is a concurrent set to that item. in that
+    // case, if a handle to the item is held, the slab release will keep
+    // trying to mark the item as moving - we currently don't have a
+    // counter for that (but this test assumes that if a handle is held,
+    // moveForSlabRelease will retry, which is where the move attempts
+    // counter used to be incremented)
+    //
+    // as a fix, we increment the move attempts counter during
+    // markMovingForSlabRelease too
     XLOG(INFO,
          "Number of move retry attempts: ",
          allocator.getSlabReleaseStats().numMoveAttempts);
     ASSERT_GT(allocator.getSlabReleaseStats().numMoveAttempts, 1);
@@ -5156,9 +5166,10 @@ class BaseAllocatorTest : public AllocatorTest {
     lookupFn("yolo");
   }
 
-  // while a chained item could be moved, try to transfer its parent and
-  // validate that move succeeds correctly.
-  void testTransferChainWhileMoving() {
+  // a chained item's move is synchronized on its parent's moving bit.
+  // try to transfer the chain to a new parent after the move completes and
+  // validate that the transfer succeeds correctly.
+  void testTransferChainAfterMoving() {
     // create an allocator worth 10 slabs. 
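+    // (Editorial note, not part of the patch: the markMoving() predicate
+    // above only succeeds when no client handles are outstanding, e.g.
+    //
+    //   RefcountWithFlags ref;
+    //   ref.markInMMContainer();
+    //   CHECK(ref.markMoving());    // refcount == 0: succeeds
+    //   ref.unmarkMoving();
+    //   ref.incRef();
+    //   CHECK(!ref.markMoving());   // refcount == 1, not chained: fails
+    //
+    // which is what RefCountTest::testMarkForEvictionAndMoving now asserts.)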
typename AllocatorT::Config config; config.configureChainedItems(); @@ -5179,15 +5190,13 @@ class BaseAllocatorTest : public AllocatorTest { struct TestSyncObj : public AllocatorT::SyncObj { TestSyncObj(std::mutex& m, std::atomic& firstTime, - folly::Baton<>& startedMoving, - folly::Baton<>& changedParent) + folly::Baton<>& startedMoving) : l(m) { if (!firstTime) { return; } firstTime = false; startedMoving.post(); - changedParent.wait(); } std::lock_guard l; @@ -5214,11 +5223,10 @@ class BaseAllocatorTest : public AllocatorTest { oldItem.getSize()); ++numMoves; }, - [&m, &startedMoving, &changedParent, - &firstTimeMovingSync](typename Item::Key key) { + [&m, &startedMoving, &firstTimeMovingSync](typename Item::Key key) { XLOG(ERR) << "Moving" << key; return std::make_unique(m, firstTimeMovingSync, - startedMoving, changedParent); + startedMoving); }, numMovingAttempts); @@ -5248,24 +5256,20 @@ class BaseAllocatorTest : public AllocatorTest { auto slabRelease = std::async(releaseFn); startedMoving.wait(); + // wait for slab release to complete. + slabRelease.wait(); // we know moving sync is held now. { auto newParent = alloc.allocate(pid, movingKey, 600); + // parent is marked moving during moved, once finished we will get handle auto parent = alloc.findToWrite(movingKey); alloc.transferChainAndReplace(parent, newParent); } - // indicate that we changed the parent. This should abort the current - // moving attempt, re-allocate the item and eventually succeed in moving. - changedParent.post(); - - // wait for slab release to complete. - slabRelease.wait(); - EXPECT_EQ(numMoves, 1); auto slabReleaseStats = alloc.getSlabReleaseStats(); - EXPECT_EQ(slabReleaseStats.numMoveAttempts, 2); + EXPECT_EQ(slabReleaseStats.numMoveAttempts, 1); EXPECT_EQ(slabReleaseStats.numMoveSuccesses, 1); auto handle = alloc.find(movingKey); diff --git a/cachelib/allocator/tests/RefCountTest.cpp b/cachelib/allocator/tests/RefCountTest.cpp index 1f31894ddc..9b5d06e958 100644 --- a/cachelib/allocator/tests/RefCountTest.cpp +++ b/cachelib/allocator/tests/RefCountTest.cpp @@ -101,7 +101,7 @@ void RefCountTest::testBasic() { ASSERT_FALSE(ref.template isFlagSet()); for (uint32_t i = 0; i < RefcountWithFlags::kAccessRefMask; i++) { - ASSERT_TRUE(ref.incRef()); + ASSERT_EQ(ref.incRef(), RefcountWithFlags::incOk); } // Incrementing past the max will fail @@ -215,17 +215,13 @@ void RefCountTest::testMarkForEvictionAndMoving() { } { - // can mark moving when ref count > 0 + // cannot mark moving when ref count > 0 RefcountWithFlags ref; ref.markInMMContainer(); ref.incRef(); - ASSERT_TRUE(ref.markMoving()); - - ref.unmarkInMMContainer(); - auto ret = ref.unmarkMoving(); - ASSERT_EQ(ret, 1); + ASSERT_FALSE(ref.markMoving()); } { diff --git a/cachelib/common/Mutex.h b/cachelib/common/Mutex.h index 1d6e5898f1..15b440d406 100644 --- a/cachelib/common/Mutex.h +++ b/cachelib/common/Mutex.h @@ -341,6 +341,7 @@ class RWBucketLocks : public BaseBucketLocks { using Lock = LockType; using ReadLockHolder = ReadLockHolderType; using WriteLockHolder = WriteLockHolderType; + using LockHolder = std::unique_lock; RWBucketLocks(uint32_t locksPower, std::shared_ptr hasher) : Base::BaseBucketLocks(locksPower, std::move(hasher)) {} @@ -357,6 +358,11 @@ class RWBucketLocks : public BaseBucketLocks { return WriteLockHolder{Base::getLock(args...)}; } + template + LockHolder tryLockExclusive(Args... 
args) noexcept {
+    return LockHolder(Base::getLock(args...), std::try_to_lock);
+  }
+
   // try to grab the reader lock for a limited _timeout_ duration
   template <typename... Args>
   ReadLockHolder lockShared(const std::chrono::microseconds& timeout,
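
As a closing editorial illustration (not part of the patch), the client-side
transparency described in the commit message can be sketched as follows.
Cache stands in for the application's concrete CacheAllocator type; find() is
the ordinary lookup API:

    #include <string>

    void readDuringMove(Cache& cache, const std::string& key) {
      // If a slab release is concurrently moving this item, find() goes
      // through acquire(), observes incFailedMoving, and returns a handle
      // backed by a wait context. The handle transparently resolves to the
      // relocated copy once the mover calls wakeUpWaiters() - no external
      // moving sync is required anymore.
      auto handle = cache.find(key);
      if (handle) {
        const auto size = handle->getSize();  // safely reads the moved copy
        (void)size;
      }
    }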