diff --git a/src/cluster_slot_stats.c b/src/cluster_slot_stats.c index b52692bd15..997f3c646e 100644 --- a/src/cluster_slot_stats.c +++ b/src/cluster_slot_stats.c @@ -132,22 +132,27 @@ static void addReplySortedSlotStats(client *c, slotStatForSort slot_stats[], lon } } -static int canAddNetworkBytesOut(client *c) { - return server.cluster_slot_stats_enabled && server.cluster_enabled && c->slot != -1; +static int canAddNetworkBytesOut(int slot) { + return clusterSlotStatsEnabled() && slot != -1; +} + +/* Accumulates egress bytes for the slot. */ +void clusterSlotStatsAddNetworkBytesOutForSlot(int slot, unsigned long long net_bytes_out) { + if (!canAddNetworkBytesOut(slot)) return; + + serverAssert(slot >= 0 && slot < CLUSTER_SLOTS); + server.cluster->slot_stats[slot].network_bytes_out += net_bytes_out; } /* Accumulates egress bytes upon sending RESP responses back to user clients. */ void clusterSlotStatsAddNetworkBytesOutForUserClient(client *c) { - if (!canAddNetworkBytesOut(c)) return; - - serverAssert(c->slot >= 0 && c->slot < CLUSTER_SLOTS); - server.cluster->slot_stats[c->slot].network_bytes_out += c->net_output_bytes_curr_cmd; + clusterSlotStatsAddNetworkBytesOutForSlot(c->slot, c->net_output_bytes_curr_cmd); } /* Accumulates egress bytes upon sending replication stream. This only applies for primary nodes. */ static void clusterSlotStatsUpdateNetworkBytesOutForReplication(long long len) { client *c = server.current_client; - if (c == NULL || !canAddNetworkBytesOut(c)) return; + if (c == NULL || !canAddNetworkBytesOut(c->slot)) return; serverAssert(c->slot >= 0 && c->slot < CLUSTER_SLOTS); serverAssert(nodeIsPrimary(server.cluster->myself)); @@ -174,24 +179,14 @@ void clusterSlotStatsDecrNetworkBytesOutForReplication(long long len) { * This type is not aggregated, to stay consistent with server.stat_net_output_bytes aggregation. * This function covers the internal propagation component. */ void clusterSlotStatsAddNetworkBytesOutForShardedPubSubInternalPropagation(client *c, int slot) { - /* For a blocked client, c->slot could be pre-filled. - * Thus c->slot is backed-up for restoration after aggregation is completed. */ - int _slot = c->slot; - c->slot = slot; - if (!canAddNetworkBytesOut(c)) { - /* c->slot should not change as a side effect of this function, - * regardless of the function's early return condition. */ - c->slot = _slot; - return; - } + if (!canAddNetworkBytesOut(slot)) return; - serverAssert(c->slot >= 0 && c->slot < CLUSTER_SLOTS); - server.cluster->slot_stats[c->slot].network_bytes_out += c->net_output_bytes_curr_cmd; + serverAssert(slot >= 0 && slot < CLUSTER_SLOTS); + server.cluster->slot_stats[slot].network_bytes_out += c->net_output_bytes_curr_cmd; /* For sharded pubsub, the client's network bytes metrics must be reset here, * as resetClient() is not called until subscription ends. */ c->net_output_bytes_curr_cmd = 0; - c->slot = _slot; } /* Adds reply for the ORDERBY variant. @@ -219,8 +214,7 @@ void clusterSlotStatResetAll(void) { * would equate to repeating the same calculation twice. */ static int canAddCpuDuration(client *c) { - return server.cluster_slot_stats_enabled && /* Config should be enabled. */ - server.cluster_enabled && /* Cluster mode should be enabled. */ + return clusterSlotStatsEnabled() && c->slot != -1 && /* Command should be slot specific. 
*/ (!server.execution_nesting || /* Either; */ (server.execution_nesting && /* 1) Command should not be nested, or */ @@ -248,7 +242,7 @@ static int canAddNetworkBytesIn(client *c) { * Third, blocked client is not aggregated, to avoid duplicate aggregation upon unblocking. * Fourth, the server is not under a MULTI/EXEC transaction, to avoid duplicate aggregation of * EXEC's 14 bytes RESP upon nested call()'s afterCommand(). */ - return server.cluster_enabled && server.cluster_slot_stats_enabled && c->slot != -1 && !(c->flag.blocked) && + return clusterSlotStatsEnabled() && c->slot != -1 && !(c->flag.blocked) && !server.in_exec; } @@ -343,3 +337,8 @@ void clusterSlotStatsCommand(client *c) { addReplySubcommandSyntaxError(c); } } + +int clusterSlotStatsEnabled(void) { + return server.cluster_slot_stats_enabled && /* Config should be enabled. */ + server.cluster_enabled; /* Cluster mode should be enabled. */ +} diff --git a/src/cluster_slot_stats.h b/src/cluster_slot_stats.h index 2e9da70aae..3a78fa309f 100644 --- a/src/cluster_slot_stats.h +++ b/src/cluster_slot_stats.h @@ -6,6 +6,7 @@ /* General use-cases. */ void clusterSlotStatReset(int slot); void clusterSlotStatResetAll(void); +int clusterSlotStatsEnabled(void); /* cpu-usec metric. */ void clusterSlotStatsAddCpuDuration(client *c, ustime_t duration); @@ -17,6 +18,7 @@ void clusterSlotStatsSetClusterMsgLength(uint32_t len); void clusterSlotStatsResetClusterMsgLength(void); /* network-bytes-out metric. */ +void clusterSlotStatsAddNetworkBytesOutForSlot(int slot, unsigned long long net_bytes_out); void clusterSlotStatsAddNetworkBytesOutForUserClient(client *c); void clusterSlotStatsIncrNetworkBytesOutForReplication(long long len); void clusterSlotStatsDecrNetworkBytesOutForReplication(long long len); diff --git a/src/config.c b/src/config.c index cc0f8d2dd8..3f7d970ae0 100644 --- a/src/config.c +++ b/src/config.c @@ -3206,6 +3206,7 @@ standardConfig static_configs[] = { createBoolConfig("cluster-slot-stats-enabled", NULL, MODIFIABLE_CONFIG, server.cluster_slot_stats_enabled, 0, NULL, NULL), createBoolConfig("hide-user-data-from-log", NULL, MODIFIABLE_CONFIG, server.hide_user_data_from_log, 1, NULL, NULL), createBoolConfig("import-mode", NULL, DEBUG_CONFIG | MODIFIABLE_CONFIG, server.import_mode, 0, NULL, NULL), + createBoolConfig("reply-offload", NULL, MODIFIABLE_CONFIG, server.reply_offload_enabled, 0, NULL, NULL), /* String Configs */ createStringConfig("aclfile", NULL, IMMUTABLE_CONFIG, ALLOW_EMPTY_STRING, server.acl_filename, "", NULL, NULL), diff --git a/src/io_threads.c b/src/io_threads.c index 3865eb77c3..849a4c1f2a 100644 --- a/src/io_threads.c +++ b/src/io_threads.c @@ -397,9 +397,13 @@ int trySendWriteToIOThreads(client *c) { * threads from reading data that might be invalid in their local CPU cache. 
*/ c->io_last_reply_block = listLast(c->reply); if (c->io_last_reply_block) { - c->io_last_bufpos = ((clientReplyBlock *)listNodeValue(c->io_last_reply_block))->used; + clientReplyBlock *block = (clientReplyBlock *)listNodeValue(c->io_last_reply_block); + c->io_last_bufpos = block->used; + /* If reply offload is enabled, force a new header */ + block->last_header = NULL; } else { c->io_last_bufpos = (size_t)c->bufpos; + c->last_header = NULL; } serverAssert(c->bufpos > 0 || c->io_last_bufpos > 0); diff --git a/src/memory_prefetch.c b/src/memory_prefetch.c index ef6a6c6d02..7726749ad0 100644 --- a/src/memory_prefetch.c +++ b/src/memory_prefetch.c @@ -119,6 +119,9 @@ static void prefetchEntry(KeyPrefetchInfo *info) { if (hashtableIncrementalFindStep(&info->hashtab_state) == 1) { /* Not done yet */ moveToNextKey(); + /* If reply offload is enabled, there is no need to prefetch the value because the main thread will not access it */ + } else if (server.reply_offload_enabled) { + markKeyAsdone(info); } else { info->state = PREFETCH_VALUE; } diff --git a/src/networking.c b/src/networking.c index 4d386d6dc4..0cc61ff9d2 100644 --- a/src/networking.c +++ b/src/networking.c @@ -40,8 +40,23 @@ #include #include #include +#include #include +typedef enum { + CLIENT_REPLY_PAYLOAD_DATA = 0, + CLIENT_REPLY_PAYLOAD_BULK_OFFLOAD, +} clientReplyPayloadType; + +/* Reply payload header */ +typedef struct __attribute__((__packed__)) payloadHeader { + size_t len; /* payload length in a reply buffer */ + size_t actual_len; /* actual reply length after offload expanding */ + uint8_t type; /* one of clientReplyPayloadType */ + int16_t slot; /* to report network-bytes-out for offloads */ + +} payloadHeader; + static void setProtocolError(const char *errstr, client *c); static void pauseClientsByClient(mstime_t end, int isPauseClientAll); int postponeClientRead(client *c); @@ -122,6 +137,24 @@ static inline int isReplicaReadyForReplData(client *replica) { !(replica->flag.close_asap); } +/* + * Reply offload can be allowed only for regular Valkey clients + * that use the _writeToClient handler to write replies to the client connection + */ +static bool isReplyOffloadAllowable(client *c) { + if (c->flag.fake) { + return false; + } + + switch (getClientType(c)) { + case CLIENT_TYPE_NORMAL: + case CLIENT_TYPE_PUBSUB: + return true; + default: + return false; + } +} + client *createClient(connection *conn) { client *c = zmalloc(sizeof(client)); @@ -150,6 +183,7 @@ client *createClient(connection *conn) { c->lib_name = NULL; c->lib_ver = NULL; c->bufpos = 0; + c->last_header = NULL; c->buf_peak = c->buf_usable_size; c->buf_peak_last_reset_time = server.unixtime; c->ref_repl_buf_node = NULL; @@ -171,7 +205,6 @@ client *createClient(connection *conn) { c->cur_script = NULL; c->multibulklen = 0; c->bulklen = -1; - c->sentlen = 0; c->raw_flag = 0; c->capa = 0; c->slot = -1; @@ -234,6 +267,9 @@ client *createClient(connection *conn) { c->commands_processed = 0; c->io_last_reply_block = NULL; c->io_last_bufpos = 0; + c->io_last_written_buf = NULL; + c->io_last_written_bufpos = 0; + c->io_last_written_data_len = 0; return c; } @@ -276,6 +312,18 @@ void putClientInPendingWriteQueue(client *c) { } } +/* + * Activate/deactivate reply offload for the client + * according to the server config + */ +static void updateReplyOffloadFlag(client *c) { + if (server.reply_offload_enabled && !c->flag.reply_offload && isReplyOffloadAllowable(c)) { + c->flag.reply_offload = 1; + } else if (!server.reply_offload_enabled && c->flag.reply_offload) { + c->flag.reply_offload = 0; + } +} +
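For orientation, a minimal sketch of the offloaded buffer layout follows (illustrative only, not part of the patch; the struct and chunk types are the ones defined in the hunks above). A DATA chunk carries verbatim RESP bytes, while a BULK_OFFLOAD chunk carries robj pointers that writevToClient later expands into "$<len>\r\n<payload>\r\n" iovec entries:

/*
 * c->buf with c->flag.reply_offload set (illustrative):
 *
 * +---------------+------------------+---------------+------------------+
 * | payloadHeader | "+OK\r\n+OK\r\n" | payloadHeader | robj* robj*      |
 * | type=DATA     | len bytes of raw | type=BULK_    | len = 2 *        |
 * | len=N slot=S  | RESP data        | OFFLOAD       | sizeof(void *)   |
 * +---------------+------------------+---------------+------------------+
 *
 * c->last_header points at the most recently written header, so consecutive
 * payloads with the same type and slot extend header->len in place instead
 * of starting a new chunk (see upsertPayloadHeader below).
 */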
/* This function is called every time we are going to transmit new data * to the client. The behavior is the following: * * @@ -322,7 +370,11 @@ int prepareClientToWrite(client *c) { /* Schedule the client to write the output buffers to the socket, unless * it should already be setup to do so (it has already pending data). */ - if (!clientHasPendingReplies(c)) putClientInPendingWriteQueue(c); + if (!clientHasPendingReplies(c)) { + /* We can change reply offload mode for the client only when its reply buffers are empty. */ + updateReplyOffloadFlag(c); + putClientInPendingWriteQueue(c); + } /* Authorize the caller to queue in the output buffer of this client. */ return C_OK; @@ -374,6 +426,49 @@ void deleteCachedResponseClient(client *recording_client) { /* ----------------------------------------------------------------------------- * Low level functions to add more data to output buffers. * -------------------------------------------------------------------------- */ +static inline void insertPayloadHeader(char *buf, size_t *bufpos, uint8_t type, size_t len, int slot, payloadHeader **last_header) { + /* Save the latest header */ + *last_header = (payloadHeader *)(buf + *bufpos); + + (*last_header)->type = type; + (*last_header)->len = len; + (*last_header)->slot = slot; + (*last_header)->actual_len = 0; + + *bufpos += sizeof(payloadHeader); +} + +static inline int updatePayloadHeader(payloadHeader *last_header, uint8_t type, size_t len, int slot) { + if (last_header->type == type && last_header->slot == slot) { + last_header->len += len; + return C_OK; + } + return C_ERR; +} + +static size_t upsertPayloadHeader(char *buf, size_t *bufpos, payloadHeader **last_header, uint8_t type, size_t len, int slot, size_t available) { + /* Enforce min len for offloads as whole pointers must be written to the buffer */ + size_t min_len = (type == CLIENT_REPLY_PAYLOAD_BULK_OFFLOAD ? len : 1); + if (min_len > available) return 0; + size_t reply_len = min(available, len); + + /* If cluster slot stats are disabled, set slot to -1 to prevent excessive per-slot headers */ + if (!clusterSlotStatsEnabled()) slot = -1; + + /* Try to add payload to last chunk if possible */ + if (*last_header != NULL) { + if (updatePayloadHeader(*last_header, type, reply_len, slot) == C_OK) return reply_len; + } + + /* Recheck the min len condition and recalculate the allowed len with a new header to be added */ + if (sizeof(payloadHeader) + min_len > available) return 0; + available -= sizeof(payloadHeader); + if (len > available) reply_len = available; + + /* Start a new payload chunk */ + insertPayloadHeader(buf, bufpos, type, reply_len, slot, last_header); + return reply_len; +} /* Attempts to add the reply to the static buffer in the client struct. * Returns the length of data that is added to the reply buffer. @@ -382,24 +477,36 @@ void deleteCachedResponseClient(client *recording_client) { * zmalloc_usable_size() call. Writing beyond client->buf boundaries confuses * sanitizer and generates a false positive out-of-bounds error */ VALKEY_NO_SANITIZE("bounds") -size_t _addReplyToBuffer(client *c, const char *s, size_t len) { - size_t available = c->buf_usable_size - c->bufpos; - +static size_t _addReplyPayloadToBuffer(client *c, const void *payload, size_t len, uint8_t payload_type) { /* If there already are entries in the reply list, we cannot * add anything more to the static buffer. */ if (listLength(c->reply) > 0) return 0; - size_t reply_len = len > available ? 
available : len; - memcpy(c->buf + c->bufpos, s, reply_len); + size_t available = c->buf_usable_size - c->bufpos; + size_t reply_len = min(available, len); + if (c->flag.reply_offload) { + reply_len = upsertPayloadHeader(c->buf, &c->bufpos, &c->last_header, payload_type, len, c->slot, available); + } + if (!reply_len) return 0; + + memcpy(c->buf + c->bufpos, payload, reply_len); c->bufpos += reply_len; /* We update the buffer peak after appending the reply to the buffer */ if (c->buf_peak < (size_t)c->bufpos) c->buf_peak = (size_t)c->bufpos; return reply_len; } +size_t _addReplyToBuffer(client *c, const char *s, size_t len) { + return _addReplyPayloadToBuffer(c, s, len, CLIENT_REPLY_PAYLOAD_DATA); +} + +size_t _addBulkOffloadToBuffer(client *c, robj *obj) { + return _addReplyPayloadToBuffer(c, &obj, sizeof(void*), CLIENT_REPLY_PAYLOAD_BULK_OFFLOAD); +} + /* Adds the reply to the reply linked list. * Note: some edits to this function need to be relayed to AddReplyFromClient. */ -void _addReplyProtoToList(client *c, list *reply_list, const char *s, size_t len) { +static void _addReplyPayloadToList(client *c, list *reply_list, const char *payload, size_t len, uint8_t payload_type) { listNode *ln = listLast(reply_list); clientReplyBlock *tail = ln ? listNodeValue(ln) : NULL; @@ -413,21 +520,32 @@ void _addReplyProtoToList(client *c, list *reply_list, const char *s, size_t len * new node */ size_t avail = tail->size - tail->used; size_t copy = avail >= len ? len : avail; - memcpy(tail->buf + tail->used, s, copy); - tail->used += copy; - s += copy; - len -= copy; + if (c->flag.reply_offload) { + copy = upsertPayloadHeader(tail->buf, &tail->used, &tail->last_header, payload_type, len, c->slot, avail); + } + if (copy) { + memcpy(tail->buf + tail->used, payload, copy); + tail->used += copy; + payload += copy; + len -= copy; + } } if (len) { /* Create a new node, make sure it is allocated to at * least PROTO_REPLY_CHUNK_BYTES */ size_t usable_size; - size_t size = len < PROTO_REPLY_CHUNK_BYTES ? PROTO_REPLY_CHUNK_BYTES : len; + size_t required_size = c->flag.reply_offload ? len + sizeof(payloadHeader) : len; + size_t size = required_size < PROTO_REPLY_CHUNK_BYTES ? PROTO_REPLY_CHUNK_BYTES : required_size; tail = zmalloc_usable(size + sizeof(clientReplyBlock), &usable_size); /* take over the allocation's internal fragmentation */ tail->size = usable_size - sizeof(clientReplyBlock); - tail->used = len; - memcpy(tail->buf, s, len); + tail->used = 0; + tail->last_header = NULL; + if (c->flag.reply_offload) { + upsertPayloadHeader(tail->buf, &tail->used, &tail->last_header, payload_type, len, c->slot, tail->size); + } + memcpy(tail->buf + tail->used, payload, len); + tail->used += len; listAddNodeTail(reply_list, tail); c->reply_bytes += tail->size; @@ -435,6 +553,14 @@ void _addReplyProtoToList(client *c, list *reply_list, const char *s, size_t len } } +void _addReplyProtoToList(client *c, list *reply_list, const char *s, size_t len) { + _addReplyPayloadToList(c, reply_list, s, len, CLIENT_REPLY_PAYLOAD_DATA); +} + +void _addBulkOffloadToList(client *c, robj *obj) { + _addReplyPayloadToList(c, c->reply, (char*) &obj, sizeof(void*), CLIENT_REPLY_PAYLOAD_BULK_OFFLOAD); +} + /* The subscribe / unsubscribe command family has a push as a reply, * or in other words, it responds with a push (or several of them * depending on how many arguments it got), and has no reply. 
*/ @@ -480,6 +606,17 @@ void _addReplyToBufferOrList(client *c, const char *s, size_t len) { if (len > reply_len) _addReplyProtoToList(c, c->reply, s + reply_len, len - reply_len); } +void _addBulkOffloadToBufferOrList(client *c, robj *obj) { + if (c->flag.close_after_reply) return; + + /* Refcount will be decremented in post write handler (i.e. in _postWriteToClient) */ + incrRefCount(obj); + + if (!_addBulkOffloadToBuffer(c, obj)) { + _addBulkOffloadToList(c, obj); + } +} + /* ----------------------------------------------------------------------------- * Higher level functions to queue data on the client output buffer. * The following functions are the ones that commands implementations will call. @@ -772,7 +909,7 @@ void trimReplyUnusedTailSpace(client *c) { * Also, to avoid large memmove which happens as part of realloc, we only do * that if the used part is small. */ if (tail->size - tail->used > tail->size / 4 && tail->used < PROTO_REPLY_CHUNK_BYTES && - c->io_write_state != CLIENT_PENDING_IO) { + c->io_write_state != CLIENT_PENDING_IO && !c->flag.reply_offload) { size_t usable_size; size_t old_size = tail->size; tail = zrealloc_usable(tail, tail->used + sizeof(clientReplyBlock), &usable_size); @@ -834,7 +971,7 @@ void setDeferredReply(client *c, void *node, const char *s, size_t length) { * - And not too large (avoid large memmove) * - And the client is not in a pending I/O state */ if (ln->prev != NULL && (prev = listNodeValue(ln->prev)) && prev->size - prev->used > 0 && - c->io_write_state != CLIENT_PENDING_IO) { + c->io_write_state != CLIENT_PENDING_IO && !c->flag.reply_offload) { size_t len_to_copy = prev->size - prev->used; if (len_to_copy > length) len_to_copy = length; memcpy(prev->buf + prev->used, s, len_to_copy); @@ -848,7 +985,7 @@ void setDeferredReply(client *c, void *node, const char *s, size_t length) { } if (ln->next != NULL && (next = listNodeValue(ln->next)) && next->size - next->used >= length && - next->used < PROTO_REPLY_CHUNK_BYTES * 4 && c->io_write_state != CLIENT_PENDING_IO) { + next->used < PROTO_REPLY_CHUNK_BYTES * 4 && c->io_write_state != CLIENT_PENDING_IO && !c->flag.reply_offload) { memmove(next->buf + length, next->buf, next->used); memcpy(next->buf, s, length); next->used += length; @@ -856,11 +993,18 @@ void setDeferredReply(client *c, void *node, const char *s, size_t length) { } else { /* Create a new node */ size_t usable_size; - clientReplyBlock *buf = zmalloc_usable(length + sizeof(clientReplyBlock), &usable_size); + size_t required_size = c->flag.reply_offload ? 
length + sizeof(payloadHeader) : length; + clientReplyBlock *buf = zmalloc_usable(required_size + sizeof(clientReplyBlock), &usable_size); /* Take over the allocation's internal fragmentation */ buf->size = usable_size - sizeof(clientReplyBlock); - buf->used = length; - memcpy(buf->buf, s, length); + buf->used = 0; + buf->last_header = 0; + if (c->flag.reply_offload) { + upsertPayloadHeader(buf->buf, &buf->used, &buf->last_header, CLIENT_REPLY_PAYLOAD_DATA, length, c->slot, buf->size); + } + memcpy(buf->buf + buf->used, s, length); + buf->used += length; + listNodeValue(ln) = buf; c->reply_bytes += buf->size; @@ -1123,8 +1267,20 @@ void addReplyBulkLen(client *c, robj *obj) { _addReplyLongLongWithPrefix(c, len, '$'); } +int tryOffloadBulkReply(client *c, robj *obj) { + if (!c->flag.reply_offload) return C_ERR; + if (obj->encoding != OBJ_ENCODING_RAW) return C_ERR; + if (obj->refcount == OBJ_STATIC_REFCOUNT) return C_ERR; + if (prepareClientToWrite(c) != C_OK) return C_ERR; + + _addBulkOffloadToBufferOrList(c, obj); + + return C_OK; +} + /* Add an Object as a bulk reply */ void addReplyBulk(client *c, robj *obj) { + if (tryOffloadBulkReply(c, obj) == C_OK) return; addReplyBulkLen(c, obj); addReply(c, obj); addReplyProto(c, "\r\n", 2); @@ -1763,6 +1919,7 @@ void freeClient(client *c) { c->pubsubshard_channels = NULL; /* Free data structures. */ + if (c->flag.reply_offload) releaseReplyOffloads(c); listRelease(c->reply); c->reply = NULL; zfree_with_size(c->buf, c->buf_usable_size); @@ -2039,19 +2196,215 @@ void writeToReplica(client *c) { } } +/* + * The replyIOV struct is used by writevToClient to prepare the iovec array + * for submission to connWritev + */ +typedef struct replyIOV { + int cnt; + int max; + struct iovec *iov; + ssize_t iov_len_total; /* Total length of data pointed to by the iov array */ + size_t last_written_len; /* Length of data in the last written buffer + * partially written in a previous writevToClient invocation */ + int limit_reached; /* Non-zero if either the max iov count or the NET_MAX_WRITES_PER_EVENT limit + was reached during iov preparation */ + int offload_active; + int prfxcnt; /* prfxcnt, prefixes and crlf are auxiliary fields + * for expanding reply offloads */ + char (*prefixes)[LONG_STR_SIZE + 3]; + char *crlf; +} replyIOV; + +/* + * The bufWriteMetadata struct is used by writevToClient to record metadata + * about client reply buffers submitted to connWritev + */ +typedef struct bufWriteMetadata { + char *buf; + size_t bufpos; + uint64_t data_len; /* Actual bytes out; differs from bufpos in case of reply offload */ + int complete; /* Was the buffer completely scattered to iov, or did + processing stop due to an encountered limit */ +} bufWriteMetadata; + +static void initReplyIOV(client *c, int iovmax, struct iovec *iov_arr, char (*prefixes)[], char *crlf, replyIOV *reply) { + reply->cnt = 0; + reply->max = iovmax; + reply->limit_reached = 0; + reply->iov = iov_arr; + reply->iov_len_total = 0; + reply->last_written_len = c->io_last_written_data_len; + reply->offload_active = c->flag.reply_offload; + if (reply->offload_active) { + reply->prfxcnt = 0; + reply->prefixes = prefixes; + reply->crlf = crlf; + } +} + +static void addPlainBufferToReplyIOV(char *buf, size_t buf_len, replyIOV *reply, bufWriteMetadata *metadata) { + if (reply->limit_reached) return; + + if (reply->cnt == reply->max || reply->iov_len_total > NET_MAX_WRITES_PER_EVENT) { + reply->limit_reached = 1; + return; + } + + /* Aggregate data len from the beginning of the buffer even though + * part of the data should be 
skipped in this round due to last_written_len */ + metadata->data_len += buf_len; + + if (reply->last_written_len >= buf_len) { + reply->last_written_len -= buf_len; + return; + } + + reply->iov[reply->cnt].iov_base = buf + reply->last_written_len; + reply->iov[reply->cnt].iov_len = buf_len - reply->last_written_len; + reply->last_written_len = 0; + + reply->iov_len_total += reply->iov[reply->cnt++].iov_len; +} + +static void addOffloadedBulkToReplyIOV(char *buf, size_t buf_len, replyIOV *reply, bufWriteMetadata *metadata) { + while (buf_len > 0 && !reply->limit_reached) { + robj **obj = (robj **)buf; + char *str = (*obj)->ptr; + size_t str_len = stringObjectLen(*obj); + + char* prefix = reply->prefixes[reply->prfxcnt]; + prefix[0] = '$'; + size_t num_len = ll2string(prefix + 1, sizeof(reply->prefixes[0]) - 3, str_len); + prefix[num_len + 1] = '\r'; + prefix[num_len + 2] = '\n'; + + int cnt = reply->cnt; + addPlainBufferToReplyIOV(reply->prefixes[reply->prfxcnt], num_len + 3, reply, metadata); + /* Increment prfxcnt only if prefix was added to reply in this round */ + if (reply->cnt > cnt) reply->prfxcnt++; + addPlainBufferToReplyIOV(str, str_len, reply, metadata); + addPlainBufferToReplyIOV(reply->crlf, 2, reply, metadata); + + buf += sizeof(void*); + buf_len -= sizeof(void*); + } +} + +static void addCompoundBufferToReplyIOV(char *buf, size_t bufpos, replyIOV *reply, bufWriteMetadata *metadata) { + char *ptr = buf; + while (ptr < buf + bufpos && !reply->limit_reached) { + payloadHeader *header = (payloadHeader*)ptr; + ptr += sizeof(payloadHeader); + if (header->type == CLIENT_REPLY_PAYLOAD_DATA) { + addPlainBufferToReplyIOV(ptr, header->len, reply, metadata); + } else { + serverAssert(header->type == CLIENT_REPLY_PAYLOAD_BULK_OFFLOAD); + uint64_t data_len = metadata->data_len; + addOffloadedBulkToReplyIOV(ptr, header->len, reply, metadata); + /* Store actual reply len for cluster slot stats */ + header->actual_len = metadata->data_len - data_len; + } + ptr += header->len; + } + serverAssert(ptr <= buf + bufpos); +} + +static void addBufferToReplyIOV(char *buf, size_t bufpos, replyIOV *reply, bufWriteMetadata *metadata) { + metadata->data_len = 0; + + if (reply->offload_active) { + addCompoundBufferToReplyIOV(buf, bufpos, reply, metadata); + metadata->complete = !reply->limit_reached; + } else { + addPlainBufferToReplyIOV(buf, bufpos, reply, metadata); + metadata->complete = 1; + } + + metadata->buf = buf; + metadata->bufpos = bufpos; +} + +static void saveLastWrittenBuf(client *c, bufWriteMetadata *metadata, int bufcnt, size_t totlen, size_t totwritten) { + int last = bufcnt - 1; + if (totwritten == totlen) { + c->io_last_written_buf = metadata[last].buf; + /* Zero io_last_written_bufpos indicates buffer written incompletely */ + c->io_last_written_bufpos = (metadata[last].complete ? metadata[last].bufpos : 0); + c->io_last_written_data_len = metadata[last].data_len; + return; + } + + last = -1; + int64_t remaining = totwritten + c->io_last_written_data_len; + while (remaining > 0) remaining -= metadata[++last].data_len; + serverAssert(last < bufcnt); + + c->io_last_written_buf = metadata[last].buf; + /* Zero io_last_written_bufpos indicates buffer written incompletely */ + c->io_last_written_bufpos = (metadata[last].complete && remaining == 0 ? 
metadata[last].bufpos : 0); + c->io_last_written_data_len = (size_t)(metadata[last].data_len + remaining); +} + +void proceedToUnwritten(replyIOV *reply, int nwritten) { + while (nwritten > 0) { + if ((size_t)nwritten < reply->iov[0].iov_len) { + reply->iov[0].iov_base = (char *)reply->iov[0].iov_base + nwritten; + reply->iov[0].iov_len -= nwritten; + break; + } + nwritten -= reply->iov[0].iov_len; + reply->iov++; + reply->cnt--; + } +} + /* This function should be called from _writeToClient when the reply list is not empty, * it gathers the scattered buffers from reply list and sends them away with connWritev. * If we write successfully, it returns C_OK, otherwise, C_ERR is returned. * Sets the c->nwritten to the number of bytes the server wrote to the client. - * Can be called from the main thread or an I/O thread */ + * Can be called from the main thread or an I/O thread + * + * INTERNALS + * writevToClient strives to write all client reply buffers to the client connection. + * However, it may hit the NET_MAX_WRITES_PER_EVENT, IOV_MAX or socket limit. In that case, + * some client reply buffers will be written completely and some partially. + * On the next invocation writevToClient must resume from the exact position where it stopped. + * writevToClient must also communicate to _postWriteToClient which buffers were written completely + * and can be released. This is intricate in case of reply offloading, as the length of a reply buffer does not match + * the number of network bytes out. + * + * For this purpose, writevToClient uses 3 data members on the client struct as input/output parameters: + * io_last_written_buf - Last buffer that has been written to the client connection + * io_last_written_bufpos - The buffer has been written until this position + * io_last_written_data_len - The actual length of the data written from this buffer. + * This length differs from the written bufpos in case of reply offload. + * For example, if a buffer holds one offloaded bulk that expands to the 11 wire bytes "$5\r\nhello\r\n" and only + * 5 of them are written, io_last_written_bufpos stays 0 (incomplete) while io_last_written_data_len becomes 5. + * + * writevToClient uses addBufferToReplyIOV, addCompoundBufferToReplyIOV, addOffloadedBulkToReplyIOV and addPlainBufferToReplyIOV + * to build the reply iovec array. These functions know to skip the already written io_last_written_data_len bytes; specifically, addPlainBufferToReplyIOV performs the skipping. + * + * At the end of execution writevToClient calls saveLastWrittenBuf to calculate the "last written" buf/pos/data_len + * and store them on the client. While building the reply iov, writevToClient gathers auxiliary bufWriteMetadata that + * helps in this calculation. In some cases it may take several (> 2) invocations for writevToClient to write the reply + * from a single buffer, but saveLastWrittenBuf still calculates the "last written" buf/pos/data_len properly. + * + * _postWriteToClient uses io_last_written_buf and io_last_written_bufpos to detect completely written buffers + * and release them. + */ static int writevToClient(client *c) { - int iovcnt = 0; int iovmax = min(IOV_MAX, c->conn->iovcnt); struct iovec iov_arr[iovmax]; - struct iovec *iov = iov_arr; - ssize_t bufpos, iov_bytes_len = 0; - listNode *lastblock; + char prefixes[iovmax / 3 + 1][LONG_STR_SIZE + 3]; + char crlf[2] = {'\r', '\n'}; + int bufcnt = 0; + bufWriteMetadata metadata[listLength(c->reply) + 1]; + replyIOV reply; + initReplyIOV(c, iovmax, iov_arr, prefixes, crlf, &reply); + + size_t bufpos = 0; + listNode *lastblock; if (inMainThread()) { lastblock = listLast(c->reply); bufpos = c->bufpos; @@ -2063,22 +2416,16 @@ static int writevToClient(client *c) { /* If the static reply buffer is not empty, * add it to the iov array for writev() as well. 
*/ if (bufpos > 0) { - iov[iovcnt].iov_base = c->buf + c->sentlen; - iov[iovcnt].iov_len = bufpos - c->sentlen; - iov_bytes_len += iov[iovcnt++].iov_len; + addBufferToReplyIOV(c->buf, bufpos, &reply, &metadata[bufcnt++]); } - /* The first node of reply list might be incomplete from the last call, - * thus it needs to be calibrated to get the actual data address and length. */ - size_t sentlen = bufpos > 0 ? 0 : c->sentlen; + listIter iter; listNode *next; - clientReplyBlock *o; - size_t used; listRewind(c->reply, &iter); - while ((next = listNext(&iter)) && iovcnt < iovmax && iov_bytes_len < NET_MAX_WRITES_PER_EVENT) { - o = listNodeValue(next); + while ((next = listNext(&iter)) && !reply.limit_reached) { + clientReplyBlock *o = listNodeValue(next); - used = o->used; + size_t used = o->used; /* Use c->io_last_bufpos as the currently used portion of the block. * We use io_last_bufpos instead of o->used to ensure that we only access data guaranteed to be visible to the * current thread. Using o->used, which may have been updated by the main thread, could lead to accessing data @@ -2087,23 +2434,22 @@ static int writevToClient(client *c) { if (used == 0) { /* empty node, skip over it. */ if (next == lastblock) break; - sentlen = 0; continue; } - iov[iovcnt].iov_base = o->buf + sentlen; - iov[iovcnt].iov_len = used - sentlen; - iov_bytes_len += iov[iovcnt++].iov_len; + addBufferToReplyIOV(o->buf, used, &reply, &metadata[bufcnt]); + if (!metadata[bufcnt].data_len) break; + bufcnt++; - sentlen = 0; if (next == lastblock) break; } - serverAssert(iovcnt != 0); + serverAssert(reply.last_written_len == 0); + serverAssert(reply.cnt != 0); ssize_t totwritten = 0; while (1) { - int nwritten = connWritev(c->conn, iov, iovcnt); + int nwritten = connWritev(c->conn, reply.iov, reply.cnt); if (nwritten <= 0) { c->write_flags |= WRITE_FLAGS_WRITE_ERROR; totwritten = totwritten > 0 ? totwritten : nwritten; @@ -2111,7 +2457,7 @@ static int writevToClient(client *c) { } totwritten += nwritten; - if (totwritten == iov_bytes_len) break; + if (totwritten == reply.iov_len_total) break; if (totwritten > NET_MAX_WRITES_PER_EVENT) { /* Note that we avoid to send more than NET_MAX_WRITES_PER_EVENT @@ -2128,20 +2474,13 @@ static int writevToClient(client *c) { } } - /* proceed to the unwritten blocks */ - while (nwritten > 0) { - if ((size_t)nwritten < iov[0].iov_len) { - iov[0].iov_base = (char *)iov[0].iov_base + nwritten; - iov[0].iov_len -= nwritten; - break; - } - nwritten -= iov[0].iov_len; - iov++; - iovcnt--; - } + proceedToUnwritten(&reply, nwritten); } c->nwritten = totwritten; + if (totwritten > 0) { + saveLastWrittenBuf(c, metadata, bufcnt, reply.iov_len_total, totwritten); + } return totwritten > 0 ? 
C_OK : C_ERR; } @@ -2163,13 +2502,14 @@ int _writeToClient(client *c) { } /* If the reply list is not empty, use writev to save system calls and TCP packets */ - if (lastblock) return writevToClient(c); + if (lastblock || c->flag.reply_offload) return writevToClient(c); - ssize_t bytes_to_write = bufpos - c->sentlen; + serverAssert(c->io_last_written_data_len == 0 || c->io_last_written_buf == c->buf); + ssize_t bytes_to_write = bufpos - c->io_last_written_data_len; ssize_t tot_written = 0; while (tot_written < bytes_to_write) { - int nwritten = connWrite(c->conn, c->buf + c->sentlen + tot_written, bytes_to_write - tot_written); + int nwritten = connWrite(c->conn, c->buf + c->io_last_written_data_len + tot_written, bytes_to_write - tot_written); if (nwritten <= 0) { c->write_flags |= WRITE_FLAGS_WRITE_ERROR; tot_written = tot_written > 0 ? tot_written : nwritten; @@ -2179,44 +2519,89 @@ int _writeToClient(client *c) { } c->nwritten = tot_written; + if (tot_written > 0) { + c->io_last_written_buf = c->buf; + c->io_last_written_bufpos = (tot_written == bytes_to_write ? bufpos : 0); + c->io_last_written_data_len = c->io_last_written_data_len + tot_written; + } return tot_written > 0 ? C_OK : C_ERR; } -static void _postWriteToClient(client *c) { - if (c->nwritten <= 0) return; +void resetLastWrittenBuf(client *c) { + c->io_last_written_buf = NULL; + c->io_last_written_bufpos = 0; + c->io_last_written_data_len = 0; +} + +static void releaseBufOffloads(char *buf, size_t bufpos) { + char *ptr = buf; + while (ptr < buf + bufpos) { + payloadHeader *header = (payloadHeader *)ptr; + ptr += sizeof(payloadHeader); + + if (header->type == CLIENT_REPLY_PAYLOAD_BULK_OFFLOAD) { + clusterSlotStatsAddNetworkBytesOutForSlot(header->slot, header->actual_len); + + robj** obj_ptr = (robj**)ptr; + size_t len = header->len; + while (len > 0) { + decrRefCount(*obj_ptr); + obj_ptr++; + len -= sizeof(obj_ptr); + } + } + + ptr += header->len; + } +} + +void releaseReplyOffloads(client *c) { + if (c->bufpos > 0) { + releaseBufOffloads(c->buf, c->bufpos); + } listIter iter; listNode *next; - clientReplyBlock *o; + listRewind(c->reply, &iter); + while ((next = listNext(&iter))) { + clientReplyBlock *o = (clientReplyBlock *)listNodeValue(next); + releaseBufOffloads(o->buf, o->used); + } +} +/* + * See INTERNALS note on writevToClient for explanation about + * io_last_written_buf and io_last_written_bufpos + */ +static void _postWriteToClient(client *c) { + if (c->nwritten <= 0) return; server.stat_net_output_bytes += c->nwritten; - /* Locate the new node which has leftover data and - * release all nodes in front of it. */ - ssize_t remaining = c->nwritten; - if (c->bufpos > 0) { /* Deal with static reply buffer first. */ - int buf_len = c->bufpos - c->sentlen; - c->sentlen += c->nwritten; - /* If the buffer was sent, set bufpos to zero to continue with - * the remainder of the reply. 
*/ - if (c->nwritten >= buf_len) { + int last_written = 0; + if (c->bufpos > 0) { + last_written = (c->buf == c->io_last_written_buf); + if (!last_written || c->bufpos == c->io_last_written_bufpos) { + if (c->flag.reply_offload) releaseBufOffloads(c->buf, c->bufpos); c->bufpos = 0; - c->sentlen = 0; + c->last_header = 0; + if (last_written) resetLastWrittenBuf(c); } - remaining -= buf_len; + if (last_written) return; } + + listIter iter; + listNode *next; listRewind(c->reply, &iter); - while (remaining > 0) { - next = listNext(&iter); - o = listNodeValue(next); - if (remaining < (ssize_t)(o->used - c->sentlen)) { - c->sentlen += remaining; - break; + while ((next = listNext(&iter))) { + clientReplyBlock *o = listNodeValue(next); + last_written = (o->buf == c->io_last_written_buf); + if (!last_written || o->used == c->io_last_written_bufpos) { + c->reply_bytes -= o->size; + if (c->flag.reply_offload) releaseBufOffloads(o->buf, o->used); + listDelNode(c->reply, next); + if (last_written) resetLastWrittenBuf(c); } - remaining -= (ssize_t)(o->used - c->sentlen); - c->reply_bytes -= o->size; - listDelNode(c->reply, next); - c->sentlen = 0; + if (last_written) return; } } @@ -2248,7 +2633,7 @@ int postWriteToClient(client *c) { if (!c->flag.primary) c->last_interaction = server.unixtime; } if (!clientHasPendingReplies(c)) { - c->sentlen = 0; + resetLastWrittenBuf(c); if (connHasWriteHandler(c->conn)) { connSetWriteHandler(c->conn, NULL); } diff --git a/src/replication.c b/src/replication.c index b5ce77f5e0..58bbd4379b 100644 --- a/src/replication.c +++ b/src/replication.c @@ -888,6 +888,7 @@ int primaryTryPartialResynchronization(client *c, long long psync_offset) { * 4) Send the backlog data (from the offset to the end) to the replica. */ waitForClientIO(c); c->flag.replica = 1; + c->flag.reply_offload = 0; if (c->associated_rdb_client_id && lookupRdbClientByID(c->associated_rdb_client_id)) { c->repl_state = REPLICA_STATE_BG_RDB_LOAD; removeReplicaFromPsyncWait(c); @@ -1151,6 +1152,7 @@ void syncCommand(client *c) { /* Wait for any IO pending operation to finish before changing the client state */ waitForClientIO(c); c->flag.replica = 1; + c->flag.reply_offload = 0; listAddNodeTail(server.replicas, c); /* Create the replication backlog if needed. */ @@ -4148,8 +4150,11 @@ void replicationCachePrimary(client *c) { server.primary->repl_applied = 0; server.primary->read_reploff = server.primary->reploff; if (c->flag.multi) discardTransaction(c); + if (c->flag.reply_offload) { + releaseReplyOffloads(c); + resetLastWrittenBuf(c); + } listEmpty(c->reply); - c->sentlen = 0; c->reply_bytes = 0; c->bufpos = 0; resetClient(c); diff --git a/src/server.h b/src/server.h index dc4d2e8808..22edc395f4 100644 --- a/src/server.h +++ b/src/server.h @@ -918,10 +918,13 @@ char *getObjectTypeName(robj *); struct evictionPoolEntry; /* Defined in evict.c */ +typedef struct payloadHeader payloadHeader; /* Defined in networking.c */ + /* This structure is used in order to represent the output buffer of a client, * which is actually a linked list of blocks like that, that is: client->reply. */ typedef struct clientReplyBlock { size_t size, used; + payloadHeader* last_header; char buf[]; } clientReplyBlock; @@ -1189,6 +1192,7 @@ typedef struct ClientFlags { uint64_t prevent_prop : 1; /* Don't propagate to AOF or replicas. */ uint64_t pending_write : 1; /* Client has output to send but a write handler is yet not installed. */ uint64_t pending_read : 1; /* Client has output to send but a write handler is yet not installed. 
*/ + uint64_t reply_offload : 1; /* Client is in reply offload mode */ uint64_t reply_off : 1; /* Don't send replies to client. */ uint64_t reply_skip_next : 1; /* Set CLIENT_REPLY_SKIP for next cmd */ uint64_t reply_skip : 1; /* Don't send just this reply. */ @@ -1241,7 +1245,7 @@ typedef struct ClientFlags { * flag, we won't cache the primary in freeClient. */ uint64_t fake : 1; /* This is a fake client without a real connection. */ uint64_t import_source : 1; /* This client is importing data to server and can visit expired key. */ - uint64_t reserved : 4; /* Reserved for future use */ + uint64_t reserved : 3; /* Reserved for future use */ } ClientFlags; typedef struct client { @@ -1286,10 +1290,13 @@ typedef struct client { long bulklen; /* Length of bulk argument in multi bulk request. */ list *reply; /* List of reply objects to send to the client. */ listNode *io_last_reply_block; /* Last client reply block when sent to IO thread */ + size_t io_last_bufpos; /* The client's bufpos at the time it was sent to the IO thread */ + char* io_last_written_buf; /* Last buffer that has been written to the client connection */ + size_t io_last_written_bufpos; /* The buffer has been written until this position */ + size_t io_last_written_data_len; /* The actual length of the data written from this buffer. + This length differs from the written bufpos in case of reply offload */ unsigned long long reply_bytes; /* Tot bytes of objects in reply list. */ list *deferred_reply_errors; /* Used for module thread safe contexts. */ - size_t sentlen; /* Amount of bytes already sent in the current - buffer or object being sent. */ time_t ctime; /* Client creation time. */ long duration; /* Current command duration. Used for measuring latency of blocking/non-blocking cmds */ int slot; /* The slot the client is executing against. Set to -1 if no slot is being used */ @@ -1376,8 +1383,8 @@ typedef struct client { /* Response buffer */ size_t buf_peak; /* Peak used size of buffer in last 5 sec interval. */ mstime_t buf_peak_last_reset_time; /* keeps the last time the buffer peak value was reset */ - int bufpos; - size_t io_last_bufpos; /* The client's bufpos at the time it was sent to the IO thread */ + size_t bufpos; + payloadHeader* last_header; /* Pointer to the last header in a buffer in reply offload mode */ size_t buf_usable_size; /* Usable size of buffer. */ char *buf; #ifdef LOG_REQ_RES @@ -1789,6 +1796,7 @@ struct valkeyServer { int events_per_io_thread; /* Number of events on the event loop to trigger IO threads activation. 
*/ int prefetch_batch_max_size; /* Maximum number of keys to prefetch in a single batch */ long long events_processed_while_blocked; /* processEventsWhileBlocked() */ + int reply_offload_enabled; /* Reply offload enabled or not */ int enable_protected_configs; /* Enable the modification of protected configs, see PROTECTED_ACTION_ALLOWED_* */ int enable_debug_cmd; /* Enable DEBUG commands, see PROTECTED_ACTION_ALLOWED_* */ int enable_module_cmd; /* Enable MODULE commands, see PROTECTED_ACTION_ALLOWED_* */ @@ -2928,6 +2936,9 @@ void ioThreadWriteToClient(void *data); int canParseCommand(client *c); int processIOThreadsReadDone(void); int processIOThreadsWriteDone(void); +void releaseReplyOffloads(client *c); +void resetLastWrittenBuf(client *c); + /* logreqres.c - logging of requests and responses */ void reqresReset(client *c, int free_buf); diff --git a/src/unit/test_files.h b/src/unit/test_files.h index f25e320452..8c30d7625d 100644 --- a/src/unit/test_files.h +++ b/src/unit/test_files.h @@ -102,6 +102,9 @@ int test_listpackBenchmarkLpCompareWithNumber(int argc, char **argv, int flags); int test_listpackBenchmarkFree(int argc, char **argv, int flags); int test_backupAndUpdateClientArgv(int argc, char **argv, int flags); int test_rewriteClientCommandArgument(int argc, char **argv, int flags); +int test_addRepliesWithOffloadsToBuffer(int argc, char **argv, int flags); +int test_addRepliesWithOffloadsToList(int argc, char **argv, int flags); +int test_addBufferToReplyIOV(int argc, char **argv, int flags); int test_object_with_key(int argc, char **argv, int flags); int test_quicklistCreateList(int argc, char **argv, int flags); int test_quicklistAddToTailOfEmptyList(int argc, char **argv, int flags); @@ -236,7 +239,7 @@ unitTest __test_hashtable_c[] = {{"test_cursor", test_cursor}, {"test_set_hash_f unitTest __test_intset_c[] = {{"test_intsetValueEncodings", test_intsetValueEncodings}, {"test_intsetBasicAdding", test_intsetBasicAdding}, {"test_intsetLargeNumberRandomAdd", test_intsetLargeNumberRandomAdd}, {"test_intsetUpgradeFromint16Toint32", test_intsetUpgradeFromint16Toint32}, {"test_intsetUpgradeFromint16Toint64", test_intsetUpgradeFromint16Toint64}, {"test_intsetUpgradeFromint32Toint64", test_intsetUpgradeFromint32Toint64}, {"test_intsetStressLookups", test_intsetStressLookups}, {"test_intsetStressAddDelete", test_intsetStressAddDelete}, {NULL, NULL}}; unitTest __test_kvstore_c[] = {{"test_kvstoreAdd16Keys", test_kvstoreAdd16Keys}, {"test_kvstoreIteratorRemoveAllKeysNoDeleteEmptyHashtable", test_kvstoreIteratorRemoveAllKeysNoDeleteEmptyHashtable}, {"test_kvstoreIteratorRemoveAllKeysDeleteEmptyHashtable", test_kvstoreIteratorRemoveAllKeysDeleteEmptyHashtable}, {"test_kvstoreHashtableIteratorRemoveAllKeysNoDeleteEmptyHashtable", test_kvstoreHashtableIteratorRemoveAllKeysNoDeleteEmptyHashtable}, {"test_kvstoreHashtableIteratorRemoveAllKeysDeleteEmptyHashtable", test_kvstoreHashtableIteratorRemoveAllKeysDeleteEmptyHashtable}, {NULL, NULL}}; unitTest __test_listpack_c[] = {{"test_listpackCreateIntList", test_listpackCreateIntList}, {"test_listpackCreateList", test_listpackCreateList}, {"test_listpackLpPrepend", test_listpackLpPrepend}, {"test_listpackLpPrependInteger", test_listpackLpPrependInteger}, {"test_listpackGetELementAtIndex", test_listpackGetELementAtIndex}, {"test_listpackPop", test_listpackPop}, {"test_listpackGetELementAtIndex2", test_listpackGetELementAtIndex2}, {"test_listpackIterate0toEnd", test_listpackIterate0toEnd}, {"test_listpackIterate1toEnd", test_listpackIterate1toEnd}, 
{"test_listpackIterate2toEnd", test_listpackIterate2toEnd}, {"test_listpackIterateBackToFront", test_listpackIterateBackToFront}, {"test_listpackIterateBackToFrontWithDelete", test_listpackIterateBackToFrontWithDelete}, {"test_listpackDeleteWhenNumIsMinusOne", test_listpackDeleteWhenNumIsMinusOne}, {"test_listpackDeleteWithNegativeIndex", test_listpackDeleteWithNegativeIndex}, {"test_listpackDeleteInclusiveRange0_0", test_listpackDeleteInclusiveRange0_0}, {"test_listpackDeleteInclusiveRange0_1", test_listpackDeleteInclusiveRange0_1}, {"test_listpackDeleteInclusiveRange1_2", test_listpackDeleteInclusiveRange1_2}, {"test_listpackDeleteWitStartIndexOutOfRange", test_listpackDeleteWitStartIndexOutOfRange}, {"test_listpackDeleteWitNumOverflow", test_listpackDeleteWitNumOverflow}, {"test_listpackBatchDelete", test_listpackBatchDelete}, {"test_listpackDeleteFooWhileIterating", test_listpackDeleteFooWhileIterating}, {"test_listpackReplaceWithSameSize", test_listpackReplaceWithSameSize}, {"test_listpackReplaceWithDifferentSize", test_listpackReplaceWithDifferentSize}, {"test_listpackRegressionGt255Bytes", test_listpackRegressionGt255Bytes}, {"test_listpackCreateLongListAndCheckIndices", test_listpackCreateLongListAndCheckIndices}, {"test_listpackCompareStrsWithLpEntries", test_listpackCompareStrsWithLpEntries}, {"test_listpackLpMergeEmptyLps", test_listpackLpMergeEmptyLps}, {"test_listpackLpMergeLp1Larger", test_listpackLpMergeLp1Larger}, {"test_listpackLpMergeLp2Larger", test_listpackLpMergeLp2Larger}, {"test_listpackLpNextRandom", test_listpackLpNextRandom}, {"test_listpackLpNextRandomCC", test_listpackLpNextRandomCC}, {"test_listpackRandomPairWithOneElement", test_listpackRandomPairWithOneElement}, {"test_listpackRandomPairWithManyElements", test_listpackRandomPairWithManyElements}, {"test_listpackRandomPairsWithOneElement", test_listpackRandomPairsWithOneElement}, {"test_listpackRandomPairsWithManyElements", test_listpackRandomPairsWithManyElements}, {"test_listpackRandomPairsUniqueWithOneElement", test_listpackRandomPairsUniqueWithOneElement}, {"test_listpackRandomPairsUniqueWithManyElements", test_listpackRandomPairsUniqueWithManyElements}, {"test_listpackPushVariousEncodings", test_listpackPushVariousEncodings}, {"test_listpackLpFind", test_listpackLpFind}, {"test_listpackLpValidateIntegrity", test_listpackLpValidateIntegrity}, {"test_listpackNumberOfElementsExceedsLP_HDR_NUMELE_UNKNOWN", test_listpackNumberOfElementsExceedsLP_HDR_NUMELE_UNKNOWN}, {"test_listpackStressWithRandom", test_listpackStressWithRandom}, {"test_listpackSTressWithVariableSize", test_listpackSTressWithVariableSize}, {"test_listpackBenchmarkInit", test_listpackBenchmarkInit}, {"test_listpackBenchmarkLpAppend", test_listpackBenchmarkLpAppend}, {"test_listpackBenchmarkLpFindString", test_listpackBenchmarkLpFindString}, {"test_listpackBenchmarkLpFindNumber", test_listpackBenchmarkLpFindNumber}, {"test_listpackBenchmarkLpSeek", test_listpackBenchmarkLpSeek}, {"test_listpackBenchmarkLpValidateIntegrity", test_listpackBenchmarkLpValidateIntegrity}, {"test_listpackBenchmarkLpCompareWithString", test_listpackBenchmarkLpCompareWithString}, {"test_listpackBenchmarkLpCompareWithNumber", test_listpackBenchmarkLpCompareWithNumber}, {"test_listpackBenchmarkFree", test_listpackBenchmarkFree}, {NULL, NULL}}; -unitTest __test_networking_c[] = {{"test_backupAndUpdateClientArgv", test_backupAndUpdateClientArgv}, {"test_rewriteClientCommandArgument", test_rewriteClientCommandArgument}, {NULL, NULL}}; +unitTest __test_networking_c[] = 
{{"test_backupAndUpdateClientArgv", test_backupAndUpdateClientArgv}, {"test_rewriteClientCommandArgument", test_rewriteClientCommandArgument}, {"test_addRepliesWithOffloadsToBuffer", test_addRepliesWithOffloadsToBuffer}, {"test_addRepliesWithOffloadsToList", test_addRepliesWithOffloadsToList}, {"test_addBufferToReplyIOV", test_addBufferToReplyIOV}, {NULL, NULL}}; unitTest __test_object_c[] = {{"test_object_with_key", test_object_with_key}, {NULL, NULL}}; unitTest __test_quicklist_c[] = {{"test_quicklistCreateList", test_quicklistCreateList}, {"test_quicklistAddToTailOfEmptyList", test_quicklistAddToTailOfEmptyList}, {"test_quicklistAddToHeadOfEmptyList", test_quicklistAddToHeadOfEmptyList}, {"test_quicklistAddToTail5xAtCompress", test_quicklistAddToTail5xAtCompress}, {"test_quicklistAddToHead5xAtCompress", test_quicklistAddToHead5xAtCompress}, {"test_quicklistAddToTail500xAtCompress", test_quicklistAddToTail500xAtCompress}, {"test_quicklistAddToHead500xAtCompress", test_quicklistAddToHead500xAtCompress}, {"test_quicklistRotateEmpty", test_quicklistRotateEmpty}, {"test_quicklistComprassionPlainNode", test_quicklistComprassionPlainNode}, {"test_quicklistNextPlainNode", test_quicklistNextPlainNode}, {"test_quicklistRotatePlainNode", test_quicklistRotatePlainNode}, {"test_quicklistRotateOneValOnce", test_quicklistRotateOneValOnce}, {"test_quicklistRotate500Val5000TimesAtCompress", test_quicklistRotate500Val5000TimesAtCompress}, {"test_quicklistPopEmpty", test_quicklistPopEmpty}, {"test_quicklistPop1StringFrom1", test_quicklistPop1StringFrom1}, {"test_quicklistPopHead1NumberFrom1", test_quicklistPopHead1NumberFrom1}, {"test_quicklistPopHead500From500", test_quicklistPopHead500From500}, {"test_quicklistPopHead5000From500", test_quicklistPopHead5000From500}, {"test_quicklistIterateForwardOver500List", test_quicklistIterateForwardOver500List}, {"test_quicklistIterateReverseOver500List", test_quicklistIterateReverseOver500List}, {"test_quicklistInsertAfter1Element", test_quicklistInsertAfter1Element}, {"test_quicklistInsertBefore1Element", test_quicklistInsertBefore1Element}, {"test_quicklistInsertHeadWhileHeadNodeIsFull", test_quicklistInsertHeadWhileHeadNodeIsFull}, {"test_quicklistInsertTailWhileTailNodeIsFull", test_quicklistInsertTailWhileTailNodeIsFull}, {"test_quicklistInsertOnceInElementsWhileIteratingAtCompress", test_quicklistInsertOnceInElementsWhileIteratingAtCompress}, {"test_quicklistInsertBefore250NewInMiddleOf500ElementsAtCompress", test_quicklistInsertBefore250NewInMiddleOf500ElementsAtCompress}, {"test_quicklistInsertAfter250NewInMiddleOf500ElementsAtCompress", test_quicklistInsertAfter250NewInMiddleOf500ElementsAtCompress}, {"test_quicklistDuplicateEmptyList", test_quicklistDuplicateEmptyList}, {"test_quicklistDuplicateListOf1Element", test_quicklistDuplicateListOf1Element}, {"test_quicklistDuplicateListOf500", test_quicklistDuplicateListOf500}, {"test_quicklistIndex1200From500ListAtFill", test_quicklistIndex1200From500ListAtFill}, {"test_quicklistIndex12From500ListAtFill", test_quicklistIndex12From500ListAtFill}, {"test_quicklistIndex100From500ListAtFill", test_quicklistIndex100From500ListAtFill}, {"test_quicklistIndexTooBig1From50ListAtFill", test_quicklistIndexTooBig1From50ListAtFill}, {"test_quicklistDeleteRangeEmptyList", test_quicklistDeleteRangeEmptyList}, {"test_quicklistDeleteRangeOfEntireNodeInListOfOneNode", test_quicklistDeleteRangeOfEntireNodeInListOfOneNode}, {"test_quicklistDeleteRangeOfEntireNodeWithOverflowCounts", 
test_quicklistDeleteRangeOfEntireNodeWithOverflowCounts}, {"test_quicklistDeleteMiddle100Of500List", test_quicklistDeleteMiddle100Of500List}, {"test_quicklistDeleteLessThanFillButAcrossNodes", test_quicklistDeleteLessThanFillButAcrossNodes}, {"test_quicklistDeleteNegative1From500List", test_quicklistDeleteNegative1From500List}, {"test_quicklistDeleteNegative1From500ListWithOverflowCounts", test_quicklistDeleteNegative1From500ListWithOverflowCounts}, {"test_quicklistDeleteNegative100From500List", test_quicklistDeleteNegative100From500List}, {"test_quicklistDelete10Count5From50List", test_quicklistDelete10Count5From50List}, {"test_quicklistNumbersOnlyListRead", test_quicklistNumbersOnlyListRead}, {"test_quicklistNumbersLargerListRead", test_quicklistNumbersLargerListRead}, {"test_quicklistNumbersLargerListReadB", test_quicklistNumbersLargerListReadB}, {"test_quicklistLremTestAtCompress", test_quicklistLremTestAtCompress}, {"test_quicklistIterateReverseDeleteAtCompress", test_quicklistIterateReverseDeleteAtCompress}, {"test_quicklistIteratorAtIndexTestAtCompress", test_quicklistIteratorAtIndexTestAtCompress}, {"test_quicklistLtrimTestAAtCompress", test_quicklistLtrimTestAAtCompress}, {"test_quicklistLtrimTestBAtCompress", test_quicklistLtrimTestBAtCompress}, {"test_quicklistLtrimTestCAtCompress", test_quicklistLtrimTestCAtCompress}, {"test_quicklistLtrimTestDAtCompress", test_quicklistLtrimTestDAtCompress}, {"test_quicklistVerifySpecificCompressionOfInteriorNodes", test_quicklistVerifySpecificCompressionOfInteriorNodes}, {"test_quicklistBookmarkGetUpdatedToNextItem", test_quicklistBookmarkGetUpdatedToNextItem}, {"test_quicklistBookmarkLimit", test_quicklistBookmarkLimit}, {"test_quicklistCompressAndDecompressQuicklistListpackNode", test_quicklistCompressAndDecompressQuicklistListpackNode}, {"test_quicklistCompressAndDecomressQuicklistPlainNodeLargeThanUINT32MAX", test_quicklistCompressAndDecomressQuicklistPlainNodeLargeThanUINT32MAX}, {NULL, NULL}}; unitTest __test_rax_c[] = {{"test_raxRandomWalk", test_raxRandomWalk}, {"test_raxIteratorUnitTests", test_raxIteratorUnitTests}, {"test_raxTryInsertUnitTests", test_raxTryInsertUnitTests}, {"test_raxRegressionTest1", test_raxRegressionTest1}, {"test_raxRegressionTest2", test_raxRegressionTest2}, {"test_raxRegressionTest3", test_raxRegressionTest3}, {"test_raxRegressionTest4", test_raxRegressionTest4}, {"test_raxRegressionTest5", test_raxRegressionTest5}, {"test_raxRegressionTest6", test_raxRegressionTest6}, {"test_raxBenchmark", test_raxBenchmark}, {"test_raxHugeKey", test_raxHugeKey}, {"test_raxFuzz", test_raxFuzz}, {NULL, NULL}}; diff --git a/src/unit/test_networking.c b/src/unit/test_networking.c index 566583bcc5..6eeb20302a 100644 --- a/src/unit/test_networking.c +++ b/src/unit/test_networking.c @@ -129,3 +129,241 @@ int test_rewriteClientCommandArgument(int argc, char **argv, int flags) { return 0; } + +static client* createTestClient(void) { + client *c = zcalloc(sizeof(client)); + + c->buf = zmalloc_usable(PROTO_REPLY_CHUNK_BYTES, &c->buf_usable_size); + c->reply = listCreate(); + listSetFreeMethod(c->reply, freeClientReplyValue); + listSetDupMethod(c->reply, dupClientReplyValue); + c->flag.reply_offload = 1; + c->flag.fake = 1; + + return c; +} + +static void freeReplyOffloadClient(client *c) { + listRelease(c->reply); + zfree(c->buf); + zfree(c); +} + +int test_addRepliesWithOffloadsToBuffer(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + + client * c = createTestClient(); + + /* Test 1: Add bulk 
offloads to the buffer */ + robj *obj = createObject(OBJ_STRING, sdscatfmt(sdsempty(), "test")); + _addBulkOffloadToBufferOrList(c, obj); + + TEST_ASSERT(obj->refcount == 2); + TEST_ASSERT(c->bufpos == sizeof(payloadHeader) + sizeof(void*)); + + payloadHeader *header1 = c->last_header; + TEST_ASSERT(header1->type == CLIENT_REPLY_PAYLOAD_BULK_OFFLOAD); + TEST_ASSERT(header1->len == sizeof(void*)); + + robj **ptr = (robj **)(c->buf + sizeof(payloadHeader)); + TEST_ASSERT(obj == *ptr); + + robj *obj2 = createObject(OBJ_STRING, sdscatfmt(sdsempty(), "test2")); + _addBulkOffloadToBufferOrList(c, obj2); + + TEST_ASSERT(c->bufpos == sizeof(payloadHeader) + 2 * sizeof(void*)); + TEST_ASSERT(header1->type == CLIENT_REPLY_PAYLOAD_BULK_OFFLOAD); + TEST_ASSERT(header1->len == 2 * sizeof(void*)); + + ptr = (robj **)(c->buf + sizeof(payloadHeader) + sizeof(void*)); + TEST_ASSERT(obj2 == *ptr); + + /* Test 2: Add plain reply to the buffer */ + const char* plain = "+OK\r\n"; + size_t plain_len = strlen(plain); + _addReplyToBufferOrList(c, plain, plain_len); + + TEST_ASSERT(c->bufpos == 2 * sizeof(payloadHeader) + 2 * sizeof(void*) + plain_len); + TEST_ASSERT(header1->type == CLIENT_REPLY_PAYLOAD_BULK_OFFLOAD); + TEST_ASSERT(header1->len == 2 * sizeof(void*)); + payloadHeader *header2 = c->last_header; + TEST_ASSERT(header2->type == CLIENT_REPLY_PAYLOAD_DATA); + TEST_ASSERT(header2->len == plain_len); + + for (int i = 0; i < 9; ++i) _addReplyToBufferOrList(c, plain, plain_len); + TEST_ASSERT(c->bufpos == 2 * sizeof(payloadHeader) + 2 * sizeof(void*) + 10 * plain_len); + TEST_ASSERT(header2->type == CLIENT_REPLY_PAYLOAD_DATA); + TEST_ASSERT(header2->len == plain_len * 10); + + /* Test 3: Add one more bulk offload to the buffer */ + _addBulkOffloadToBufferOrList(c, obj); + TEST_ASSERT(obj->refcount == 3); + TEST_ASSERT(c->bufpos == 3 * sizeof(payloadHeader) + 3 * sizeof(void*) + 10 * plain_len); + payloadHeader *header3 = c->last_header; + TEST_ASSERT(header3->type == CLIENT_REPLY_PAYLOAD_BULK_OFFLOAD); + ptr = (robj **)((char*)c->last_header + sizeof(payloadHeader)); + TEST_ASSERT(obj == *ptr); + + decrRefCount(obj); + decrRefCount(obj); + decrRefCount(obj); + + decrRefCount(obj2); + decrRefCount(obj2); + + freeReplyOffloadClient(c); + + return 0; +} + +int test_addRepliesWithOffloadsToList(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + + client *c = createTestClient(); + + /* Test 1: Add bulk offloads to the reply list */ + + /* Reply len to fill the buffer almost completely */ + size_t reply_len = c->buf_usable_size - 2 * sizeof(payloadHeader) - 4; + + char *reply = zmalloc(reply_len); + memset(reply, 'a', reply_len); + _addReplyToBufferOrList(c, reply, reply_len); + TEST_ASSERT(c->bufpos == sizeof(payloadHeader) + reply_len); + TEST_ASSERT(listLength(c->reply) == 0); + + robj *obj = createObject(OBJ_STRING, sdscatfmt(sdsempty(), "test")); + _addBulkOffloadToBufferOrList(c, obj); + + TEST_ASSERT(obj->refcount == 2); + TEST_ASSERT(c->bufpos == sizeof(payloadHeader) + reply_len); + TEST_ASSERT(listLength(c->reply) == 1); + + listIter iter; + listRewind(c->reply, &iter); + listNode *next = listNext(&iter); + clientReplyBlock *blk = listNodeValue(next); + + TEST_ASSERT(blk->used == sizeof(payloadHeader) + sizeof(void*)); + payloadHeader *header1 = blk->last_header; + TEST_ASSERT(header1->type == CLIENT_REPLY_PAYLOAD_BULK_OFFLOAD); + TEST_ASSERT(header1->len == sizeof(void*)); + + robj **ptr = (robj **)(blk->buf + sizeof(payloadHeader)); + TEST_ASSERT(obj == *ptr); + + 
/* Test 2: Add one more bulk offload to the reply list */ + _addBulkOffloadToBufferOrList(c, obj); + TEST_ASSERT(obj->refcount == 3); + TEST_ASSERT(listLength(c->reply) == 1); + TEST_ASSERT(blk->used == sizeof(payloadHeader) + 2 * sizeof(void*)); + TEST_ASSERT(header1->type == CLIENT_REPLY_PAYLOAD_BULK_OFFLOAD); + TEST_ASSERT(header1->len == 2 * sizeof(void*)); + + /* Test 3: Add plain replies to make the reply list grow */ + while (reply_len < blk->size - blk->used) _addReplyToBufferOrList(c, reply, reply_len); + _addReplyToBufferOrList(c, reply, reply_len); + + TEST_ASSERT(listLength(c->reply) == 2); + /* last header in 1st block */ + payloadHeader *header2 = blk->last_header; + listRewind(c->reply, &iter); + listNext(&iter); + next = listNext(&iter); + clientReplyBlock *blk2 = listNodeValue(next); + /* last header in 2nd block */ + payloadHeader *header3 = blk2->last_header; + TEST_ASSERT(header2->type == CLIENT_REPLY_PAYLOAD_DATA && header3->type == CLIENT_REPLY_PAYLOAD_DATA); + TEST_ASSERT((header2->len + header3->len) % reply_len == 0); + + decrRefCount(obj); + decrRefCount(obj); + decrRefCount(obj); + + zfree(reply); + freeReplyOffloadClient(c); + + return 0; +} + +int test_addBufferToReplyIOV(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + + const char *expected_reply = "$5\r\nhello\r\n"; + ssize_t total_len = strlen(expected_reply); + const int iovmax = 16; + char crlf[2] = {'\r', '\n'}; + + /* Test 1: 1st writevToClient invocation */ + client *c = createTestClient(); + robj *obj = createObject(OBJ_STRING, sdscatfmt(sdsempty(), "hello")); + _addBulkOffloadToBufferOrList(c, obj); + + struct iovec iov_arr[iovmax]; + char prefixes[iovmax / 3 + 1][LONG_STR_SIZE + 3]; + bufWriteMetadata metadata[1]; + + replyIOV reply; + initReplyIOV(c, iovmax, iov_arr, prefixes, crlf, &reply); + addBufferToReplyIOV(c->buf, c->bufpos, &reply, &metadata[0]); + + TEST_ASSERT(reply.iov_len_total == total_len); + TEST_ASSERT(reply.cnt == 3); + const char *ptr = expected_reply; + for (int i = 0; i < reply.cnt; ++i) { + TEST_ASSERT(memcmp(ptr, reply.iov[i].iov_base, reply.iov[i].iov_len) == 0); + ptr += reply.iov[i].iov_len; + } + + /* Test 2: Last written buf/pos/data_len after 1st invocation */ + saveLastWrittenBuf(c, metadata, 1, reply.iov_len_total, 1); /* only 1 byte has been written */ + TEST_ASSERT(c->io_last_written_buf == c->buf); + TEST_ASSERT(c->io_last_written_bufpos == 0); /* incomplete write */ + TEST_ASSERT(c->io_last_written_data_len == 1); + + /* Test 3: 2nd writevToClient invocation */ + struct iovec iov_arr2[iovmax]; + char prefixes2[iovmax / 3 + 1][LONG_STR_SIZE + 3]; + bufWriteMetadata metadata2[1]; + + replyIOV reply2; + initReplyIOV(c, iovmax, iov_arr2, prefixes2, crlf, &reply2); + addBufferToReplyIOV(c->buf, c->bufpos, &reply2, &metadata2[0]); + TEST_ASSERT(reply2.iov_len_total == total_len - 1); + TEST_ASSERT((*(char*)reply2.iov[0].iov_base) == '5'); + + /* Test 4: Last written buf/pos/data_len after 2nd invocation */ + saveLastWrittenBuf(c, metadata2, 1, reply2.iov_len_total, 4); /* 4 more bytes have been written */ + TEST_ASSERT(c->io_last_written_buf == c->buf); + TEST_ASSERT(c->io_last_written_bufpos == 0); /* incomplete write */ + TEST_ASSERT(c->io_last_written_data_len == 5); /* 1 + 4 */ + + /* Test 5: 3rd writevToClient invocation */ + struct iovec iov_arr3[iovmax]; + char prefixes3[iovmax / 3 + 1][LONG_STR_SIZE + 3]; + bufWriteMetadata metadata3[1]; + + replyIOV reply3; + initReplyIOV(c, iovmax, iov_arr3, prefixes3, crlf, &reply3); + 
addBufferToReplyIOV(c->buf, c->bufpos, &reply3, &metadata3[0]); + TEST_ASSERT(reply3.iov_len_total == total_len - 5); + TEST_ASSERT((*(char*)reply3.iov[0].iov_base) == 'e'); + + /* Test 6: Last written buf/pos/data_len after 3rd invocation */ + saveLastWrittenBuf(c, metadata3, 1, reply3.iov_len_total, reply3.iov_len_total); /* everything has been written */ + TEST_ASSERT(c->io_last_written_buf == c->buf); + TEST_ASSERT(c->io_last_written_bufpos == c->bufpos); + TEST_ASSERT(c->io_last_written_data_len == (size_t)total_len); + + decrRefCount(obj); + decrRefCount(obj); + + freeReplyOffloadClient(c); + + return 0; +} \ No newline at end of file diff --git a/tests/support/server.tcl b/tests/support/server.tcl index 7257339042..fb057e0daf 100644 --- a/tests/support/server.tcl +++ b/tests/support/server.tcl @@ -246,6 +246,11 @@ proc tags_acceptable {tags err_return} { return 0 } + if {$::reply_offload && [lsearch $tags "reply-offload:skip"] >= 0} { + set err "Not supported in reply-offload mode" + return 0 + } + if {$::tcl_version < 8.6 && [lsearch $tags "ipv6"] >= 0} { set err "TCL version is too low and does not support this" return 0 @@ -513,6 +518,10 @@ proc start_server {options {code undefined}} { dict set config "events-per-io-thread" 0 } + if {$::reply_offload} { + dict set config "reply-offload" "yes" + } + foreach line $data { if {[string length $line] > 0 && [string index $line 0] ne "#"} { set elements [split $line " "] diff --git a/tests/test_helper.tcl b/tests/test_helper.tcl index 1f0658071a..8e74eba38d 100644 --- a/tests/test_helper.tcl +++ b/tests/test_helper.tcl @@ -55,6 +55,7 @@ set ::valgrind 0 set ::durable 0 set ::tls 0 set ::io_threads 0 +set ::reply_offload 0 set ::tls_module 0 set ::stack_logging 0 set ::verbose 0 @@ -596,6 +597,7 @@ proc print_help_screen {} { "--wait-server Wait after server is started (so that you can attach a debugger)." "--dump-logs Dump server log on test failure." "--io-threads Run tests with IO threads." + "--reply-offload Run tests with reply offload enabled." "--tls Run tests in TLS mode." "--tls-module Run tests in TLS mode with Valkey module." "--host Run tests against an external host." 
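The new switch follows the pattern of the existing mode flags: when $::reply_offload is set, tags_acceptable rejects any test tagged reply-offload:skip, and start_server injects "reply-offload yes" into every spawned server's config. Assuming the suite is driven through the usual ./runtest wrapper around test_helper.tcl, a run with the feature enabled would be invoked as ./runtest --reply-offload, and it can be combined with --io-threads since the two globals are independent.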
@@ -656,6 +658,8 @@ for {set j 0} {$j < [llength $argv]} {incr j} { set ::quiet 1 } elseif {$opt eq {--io-threads}} { set ::io_threads 1 + } elseif {$opt eq {--reply-offload}} { + set ::reply_offload 1 } elseif {$opt eq {--tls} || $opt eq {--tls-module}} { package require tls 1.6 set ::tls 1 diff --git a/tests/unit/client-eviction.tcl b/tests/unit/client-eviction.tcl index ceeb20f7b6..4140b6d05e 100644 --- a/tests/unit/client-eviction.tcl +++ b/tests/unit/client-eviction.tcl @@ -1,4 +1,4 @@ -tags {"external:skip logreqres:skip"} { +tags {"external:skip logreqres:skip reply-offload:skip"} { # Get info about a server client connection: # name - name of client we want to query diff --git a/tests/unit/cluster/slot-stats.tcl b/tests/unit/cluster/slot-stats.tcl index 99f9c1c03a..2034a51d97 100644 --- a/tests/unit/cluster/slot-stats.tcl +++ b/tests/unit/cluster/slot-stats.tcl @@ -524,9 +524,12 @@ start_cluster 1 0 {tags {external:skip cluster}} { R 0 SET $key value # +OK\r\n --> 5 bytes + R 0 GET $key + # $5\r\nvalue\r\n --> 11 bytes + set expected_slot_stats [ dict create $key_slot [ - dict create network-bytes-out 5 + dict create network-bytes-out 16 ] ] set slot_stats [R 0 CLUSTER SLOT-STATS SLOTSRANGE 0 16383] diff --git a/tests/unit/info.tcl b/tests/unit/info.tcl index e50faba62b..39bbf64081 100644 --- a/tests/unit/info.tcl +++ b/tests/unit/info.tcl @@ -403,7 +403,7 @@ start_server {tags {"info" "external:skip"}} { set info [r info stats] assert_equal [getInfoProperty $info client_output_buffer_limit_disconnections] {1} r config set client-output-buffer-limit $org_outbuf_limit - } {OK} {logreqres:skip} ;# same as obuf-limits.tcl, skip logreqres + } {OK} {logreqres:skip reply-offload:skip} ;# same as obuf-limits.tcl, skip logreqres and reply-offload test {clients: pubsub clients} { set info [r info clients] diff --git a/tests/unit/maxmemory.tcl b/tests/unit/maxmemory.tcl index 5b76f44645..504b787bda 100644 --- a/tests/unit/maxmemory.tcl +++ b/tests/unit/maxmemory.tcl @@ -1,4 +1,4 @@ -start_server {tags {"maxmemory" "external:skip"}} { +start_server {tags {"maxmemory external:skip reply-offload:skip"}} { r config set maxmemory 11mb r config set maxmemory-policy allkeys-lru set server_pid [s process_id] diff --git a/tests/unit/obuf-limits.tcl b/tests/unit/obuf-limits.tcl index b0fd184afe..1f391dfd73 100644 --- a/tests/unit/obuf-limits.tcl +++ b/tests/unit/obuf-limits.tcl @@ -1,4 +1,4 @@ -start_server {tags {"obuf-limits external:skip logreqres:skip"}} { +start_server {tags {"obuf-limits external:skip logreqres:skip reply-offload:skip"}} { test {CONFIG SET client-output-buffer-limit} { set oldval [lindex [r config get client-output-buffer-limit] 1] diff --git a/tests/unit/replybufsize.tcl b/tests/unit/replybufsize.tcl index ae3b914ea6..4929fa832f 100644 --- a/tests/unit/replybufsize.tcl +++ b/tests/unit/replybufsize.tcl @@ -8,7 +8,7 @@ proc get_reply_buffer_size {cname} { return $rbufsize } -start_server {tags {"replybufsize"}} { +start_server {tags {"replybufsize reply-offload:skip"}} { test {verify reply buffer limits} { # In order to reduce test time we can set the peak reset time very low diff --git a/valkey.conf b/valkey.conf index e23aea39de..3cee81c2c3 100644 --- a/valkey.conf +++ b/valkey.conf @@ -1439,6 +1439,12 @@ lazyfree-lazy-user-flush yes # # prefetch-batch-max-size 16 # +# For use cases where command replies include Bulk strings (e.g.
GET, MGET), +# reply offload can be enabled to eliminate expensive memory accesses +# and redundant data copies performed by the main thread. +# +# reply-offload yes +# # NOTE: # 1. The 'io-threads-do-reads' config is deprecated and has no effect. Please # avoid using this config if possible.
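Taken together, the unit tests in test_networking.c above pin down the invariant behind reply offload: the client buffer holds a sequence of (payloadHeader, payload) records, where a payload is either plain RESP bytes (CLIENT_REPLY_PAYLOAD_DATA) or an array of robj pointers (CLIENT_REPLY_PAYLOAD_BULK_OFFLOAD), and consecutive payloads of the same type are coalesced under a single header rather than each getting its own. The standalone sketch below illustrates only that coalescing rule; the names mirror the tests, not necessarily the real definitions in networking.c, and in-buffer header alignment is glossed over.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define PAYLOAD_DATA 0         /* plain RESP bytes */
#define PAYLOAD_BULK_OFFLOAD 1 /* robj pointers instead of bytes */

typedef struct {
    uint8_t type;
    uint32_t len; /* payload bytes following this header */
} payloadHeader;

/* Append a payload of `type` at `pos`; if the previous payload has the
 * same type, grow its header instead of emitting a new one. */
static size_t appendPayload(char *buf, size_t pos, payloadHeader **last,
                            uint8_t type, const void *data, uint32_t len) {
    if (*last == NULL || (*last)->type != type) {
        payloadHeader h = {type, len};
        memcpy(buf + pos, &h, sizeof(h));
        *last = (payloadHeader *)(buf + pos);
        pos += sizeof(h);
    } else {
        (*last)->len += len;
    }
    memcpy(buf + pos, data, len);
    return pos + len;
}

int main(void) {
    char buf[128];
    size_t pos = 0;
    payloadHeader *last = NULL;
    void *obj = (void *)0x1; /* stand-in for a robj pointer */

    /* Two offloads in a row share one header (len doubles)... */
    pos = appendPayload(buf, pos, &last, PAYLOAD_BULK_OFFLOAD, &obj, sizeof(obj));
    pos = appendPayload(buf, pos, &last, PAYLOAD_BULK_OFFLOAD, &obj, sizeof(obj));
    /* ...while a plain reply switches type and starts a new header. */
    pos = appendPayload(buf, pos, &last, PAYLOAD_DATA, "+OK\r\n", 5);

    printf("bufpos=%zu, last payload: type=%u len=%u\n",
           pos, (unsigned)last->type, (unsigned)last->len);
    return 0;
}

This is exactly why test_addRepliesWithOffloadsToBuffer expects header1->len to double after the second offload instead of a second header appearing. For manual verification, the config can presumably also be toggled at runtime like other boolean configs (config set reply-offload yes), in addition to the commented valkey.conf entry shown above.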