Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DAOS-16878 pool: Reduce unexpected DER_NO_SERVICEs #15665

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 63 additions & 34 deletions src/pool/srv_pool.c
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,7 @@ struct pool_svc {
rdb_path_t ps_handles; /* pool handle KVS */
rdb_path_t ps_user; /* pool user attributes KVS */
rdb_path_t ps_ops; /* metadata ops KVS */
int ps_error; /* of the persistent kind */
struct pool_svc_events ps_events;
uint32_t ps_global_version;
int ps_svc_rf;
Expand Down Expand Up @@ -2329,8 +2330,8 @@ pool_svc_step_up_cb(struct ds_rsvc *rsvc)

rc = ds_pool_iv_prop_update(svc->ps_pool, prop);
if (rc) {
D_ERROR("ds_pool_iv_prop_update failed: " DF_RC "\n", DP_RC(rc));
D_GOTO(out, rc);
DL_ERROR(rc, DF_UUID ": ds_pool_iv_prop_update failed", DP_UUID(svc->ps_uuid));
goto out;
}

if (!uuid_is_null(svc->ps_pool->sp_srv_cont_hdl)) {
Expand All @@ -2346,11 +2347,10 @@ pool_svc_step_up_cb(struct ds_rsvc *rsvc)
uuid_copy(svc->ps_pool->sp_srv_pool_hdl, pool_hdl_uuid);
}

rc = ds_pool_iv_srv_hdl_update(svc->ps_pool, pool_hdl_uuid,
cont_hdl_uuid);
if (rc) {
D_ERROR("ds_pool_iv_srv_hdl_update failed: " DF_RC "\n", DP_RC(rc));
D_GOTO(out, rc);
rc = ds_pool_iv_srv_hdl_update(svc->ps_pool, pool_hdl_uuid, cont_hdl_uuid);
if (rc != 0) {
DL_ERROR(rc, DF_UUID ": ds_pool_iv_srv_hdl_update failed", DP_UUID(svc->ps_uuid));
goto out;
}

/* resume pool upgrade if needed */
Expand Down Expand Up @@ -2385,26 +2385,31 @@ pool_svc_step_up_cb(struct ds_rsvc *rsvc)
D_FREE(map_buf);
if (prop != NULL)
daos_prop_free(prop);
if (rc < 0)
ds_pool_failed_add(svc->ps_uuid, rc);
else if (rc == 0)
ds_pool_failed_remove(svc->ps_uuid);
if (rc == -DER_DF_INCOMPT) {
DL_ERROR(rc, DF_UUID ": stepping up with error", DP_UUID(svc->ps_uuid));
svc->ps_error = rc;
rc = 0;
}
return rc;
}

static void
pool_svc_step_down_cb(struct ds_rsvc *rsvc)
{
struct pool_svc *svc = pool_svc_obj(rsvc);
d_rank_t rank = dss_self_rank();
struct pool_svc *svc = pool_svc_obj(rsvc);
d_rank_t rank = dss_self_rank();

pool_svc_step_down_metrics(svc);
fini_events(svc);
sched_cancel_and_wait(&svc->ps_reconf_sched);
sched_cancel_and_wait(&svc->ps_rfcheck_sched);
ds_cont_svc_step_down(svc->ps_cont_svc);
if (svc->ps_error == 0) {
pool_svc_step_down_metrics(svc);
fini_events(svc);
sched_cancel_and_wait(&svc->ps_reconf_sched);
sched_cancel_and_wait(&svc->ps_rfcheck_sched);
ds_cont_svc_step_down(svc->ps_cont_svc);
} else {
svc->ps_error = 0;
}

DS_POOL_NOTE_PRINT(DF_UUID": rank %u no longer pool service leader "DF_U64"\n",
DS_POOL_NOTE_PRINT(DF_UUID ": rank %u no longer pool service leader " DF_U64 "\n",
DP_UUID(svc->ps_uuid), rank, svc->ps_rsvc.s_term);
}

Expand Down Expand Up @@ -2476,12 +2481,13 @@ ds_pool_rsvc_class_unregister(void)
ds_rsvc_class_unregister(DS_RSVC_CLASS_POOL);
}

/* Use pool_svc_lookup_leader instead. */
static int
pool_svc_lookup(uuid_t uuid, struct pool_svc **svcp)
{
struct ds_rsvc *rsvc;
d_iov_t id;
int rc;
d_iov_t id;
int rc;

d_iov_set(&id, uuid, sizeof(uuid_t));
rc = ds_rsvc_lookup(DS_RSVC_CLASS_POOL, &id, &rsvc);
Expand All @@ -2492,25 +2498,40 @@ pool_svc_lookup(uuid_t uuid, struct pool_svc **svcp)
}

static int
pool_svc_lookup_leader(uuid_t uuid, struct pool_svc **svcp,
struct rsvc_hint *hint)
pool_svc_lookup_leader(uuid_t uuid, struct pool_svc **svcp, struct rsvc_hint *hint)
{
struct ds_rsvc *rsvc;
d_iov_t id;
int rc;
struct ds_rsvc *rsvc;
d_iov_t id;
struct pool_svc *svc;
int rc;

rc = ds_pool_failed_lookup(uuid);
if (rc) {
D_ERROR(DF_UUID": failed to start: "DF_RC"\n",
DP_UUID(uuid), DP_RC(rc));
if (rc != 0) {
D_DEBUG(DB_MD, DF_UUID ": failed: " DF_RC "\n", DP_UUID(uuid), DP_RC(rc));
return -DER_NO_SERVICE;
}

d_iov_set(&id, uuid, sizeof(uuid_t));
rc = ds_rsvc_lookup_leader(DS_RSVC_CLASS_POOL, &id, &rsvc, hint);
if (rc != 0)
return rc;
*svcp = pool_svc_obj(rsvc);

/*
* Checking svc->ps_error without confirming our leadership via
* rdb_raft_verify_leadership may cause some requests to get
* unnecessary errors, if there is a newer leader whose svc->ps_error
* is zero and is able to serve those requests. Such a state won't last
* much longer than an election timeout though, because we will step
* down due to inability to maintain a majority lease.
*/
svc = pool_svc_obj(rsvc);
if (svc->ps_error != 0) {
rc = svc->ps_error;
ds_rsvc_put_leader(rsvc);
return rc;
}

*svcp = svc;
return 0;
}

Expand Down Expand Up @@ -2582,6 +2603,7 @@ int ds_pool_failed_add(uuid_t uuid, int rc)
uuid_copy(psf->psf_uuid, uuid);
psf->psf_error = rc;
d_list_add_tail(&psf->psf_link, &pool_svc_failed_list);
DL_ERROR(rc, DF_UUID ": added to list of failed pools", DP_UUID(uuid));
out:
D_RWLOCK_UNLOCK(&psfl_rwlock);
return ret;
Expand All @@ -2596,6 +2618,8 @@ void ds_pool_failed_remove(uuid_t uuid)
d_list_for_each_entry_safe(psf, tmp, &pool_svc_failed_list, psf_link) {
if (uuid_compare(psf->psf_uuid, uuid) == 0) {
d_list_del_init(&psf->psf_link);
DL_INFO(psf->psf_error, DF_UUID ": removed from list of failed pools",
DP_UUID(uuid));
D_FREE(psf);
break;
}
Expand Down Expand Up @@ -3431,7 +3455,7 @@ ds_pool_svc_ops_lookup(struct rdb_tx *tx, void *pool_svc, uuid_t pool_uuid, uuid
int rc = 0;

if (!svc) {
rc = pool_svc_lookup(pool_uuid, &svc);
rc = pool_svc_lookup_leader(pool_uuid, &svc, NULL /* hint */);
if (rc != 0) {
DL_ERROR(rc, "pool_svc lookup failed");
goto out;
Expand Down Expand Up @@ -3479,7 +3503,7 @@ ds_pool_svc_ops_lookup(struct rdb_tx *tx, void *pool_svc, uuid_t pool_uuid, uuid
D_FREE(op_key_enc.iov_buf);
out_svc:
if (need_put_svc)
pool_svc_put(svc);
pool_svc_put_leader(svc);
out:
if (rc == 0) {
*is_dup = duplicate;
Expand Down Expand Up @@ -3526,7 +3550,7 @@ ds_pool_svc_ops_save(struct rdb_tx *tx, void *pool_svc, uuid_t pool_uuid, uuid_t
int rc = 0;

if (!svc) {
rc = pool_svc_lookup(pool_uuid, &svc);
rc = pool_svc_lookup_leader(pool_uuid, &svc, NULL /* hint */);
if (rc != 0) {
DL_ERROR(rc, "pool_svc lookup failed");
goto out;
Expand Down Expand Up @@ -3588,7 +3612,7 @@ ds_pool_svc_ops_save(struct rdb_tx *tx, void *pool_svc, uuid_t pool_uuid, uuid_t
D_FREE(op_key_enc.iov_buf);
out_svc:
if (need_put_svc)
pool_svc_put(svc);
pool_svc_put_leader(svc);
out:
return rc;
}
Expand Down Expand Up @@ -3681,6 +3705,11 @@ ds_pool_create_handler(crt_rpc_t *rpc)
ABT_rwlock_wrlock(svc->ps_lock);
ds_cont_wrlock_metadata(svc->ps_cont_svc);

if (svc->ps_error != 0) {
rc = svc->ps_error;
goto out_tx;
}

/* See if the DB has already been initialized. */
d_iov_set(&value, NULL /* buf */, 0 /* size */);
rc = rdb_tx_lookup(&tx, &svc->ps_root, &ds_pool_prop_map_buffer,
Expand Down
Loading