You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
D 12-16 15:00:45 authentication.py:416] Key sky-ssh-keys does not exist in the cluster, creating it...
Traceback (most recent call last):
File "/Users/romilb/tools/anaconda3/bin/sky", line 8, in <module>
sys.exit(cli())
File "/Users/romilb/tools/anaconda3/lib/python3.9/site-packages/click/core.py", line 1157, in __call__
return self.main(*args, **kwargs)
File "/Users/romilb/tools/anaconda3/lib/python3.9/site-packages/click/core.py", line 1078, in main
rv = self.invoke(ctx)
File "/Users/romilb/Romil/Berkeley/Research/sky-experiments/sky/utils/common_utils.py", line 366, in _record
return f(*args, **kwargs)
File "/Users/romilb/Romil/Berkeley/Research/sky-experiments/sky/cli.py", line 838, in invoke
return super().invoke(ctx)
File "/Users/romilb/tools/anaconda3/lib/python3.9/site-packages/click/core.py", line 1688, in invoke
return _process_result(sub_ctx.command.invoke(sub_ctx))
File "/Users/romilb/tools/anaconda3/lib/python3.9/site-packages/click/core.py", line 1434, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/Users/romilb/tools/anaconda3/lib/python3.9/site-packages/click/core.py", line 783, in invoke
return __callback(*args, **kwargs)
File "/Users/romilb/Romil/Berkeley/Research/sky-experiments/sky/utils/common_utils.py", line 386, in _record
return f(*args, **kwargs)
File "/Users/romilb/Romil/Berkeley/Research/sky-experiments/sky/cli.py", line 1159, in launch
_launch_with_confirm(task,
File "/Users/romilb/Romil/Berkeley/Research/sky-experiments/sky/cli.py", line 628, in _launch_with_confirm
sky.launch(
File "/Users/romilb/Romil/Berkeley/Research/sky-experiments/sky/utils/common_utils.py", line 386, in _record
return f(*args, **kwargs)
File "/Users/romilb/Romil/Berkeley/Research/sky-experiments/sky/utils/common_utils.py", line 386, in _record
return f(*args, **kwargs)
File "/Users/romilb/Romil/Berkeley/Research/sky-experiments/sky/execution.py", line 529, in launch
return _execute(
File "/Users/romilb/Romil/Berkeley/Research/sky-experiments/sky/execution.py", line 302, in _execute
handle = backend.provision(
File "/Users/romilb/Romil/Berkeley/Research/sky-experiments/sky/utils/common_utils.py", line 386, in _record
return f(*args, **kwargs)
File "/Users/romilb/Romil/Berkeley/Research/sky-experiments/sky/utils/common_utils.py", line 366, in _record
return f(*args, **kwargs)
File "/Users/romilb/Romil/Berkeley/Research/sky-experiments/sky/backends/backend.py", line 84, in provision
return self._provision(task, to_provision, dryrun, stream_logs,
File "/Users/romilb/Romil/Berkeley/Research/sky-experiments/sky/backends/cloud_vm_ray_backend.py", line 2838, in _provision
config_dict = retry_provisioner.provision_with_retries(
File "/Users/romilb/Romil/Berkeley/Research/sky-experiments/sky/utils/common_utils.py", line 386, in _record
return f(*args, **kwargs)
File "/Users/romilb/Romil/Berkeley/Research/sky-experiments/sky/backends/cloud_vm_ray_backend.py", line 2026, in provision_with_retries
config_dict = self._retry_zones(
File "/Users/romilb/Romil/Berkeley/Research/sky-experiments/sky/backends/cloud_vm_ray_backend.py", line 1429, in _retry_zones
config_dict = backend_utils.write_cluster_config(
File "/Users/romilb/Romil/Berkeley/Research/sky-experiments/sky/utils/common_utils.py", line 386, in _record
return f(*args, **kwargs)
File "/Users/romilb/Romil/Berkeley/Research/sky-experiments/sky/backends/backend_utils.py", line 940, in write_cluster_config
_add_auth_to_cluster_config(cloud, tmp_yaml_path)
File "/Users/romilb/Romil/Berkeley/Research/sky-experiments/sky/backends/backend_utils.py", line 1010, in _add_auth_to_cluster_config
config = auth.setup_kubernetes_authentication(config)
File "/Users/romilb/Romil/Berkeley/Research/sky-experiments/sky/authentication.py", line 418, in setup_kubernetes_authentication
kubernetes.core_api(context).create_namespaced_secret(namespace, secret)
File "/Users/romilb/tools/anaconda3/lib/python3.9/contextlib.py", line 79, in inner
return func(*args, **kwds)
File "/Users/romilb/tools/anaconda3/lib/python3.9/site-packages/kubernetes/client/api/core_v1_api.py", line 8232, in create_namespaced_secret
return self.create_namespaced_secret_with_http_info(namespace, body, **kwargs) # noqa: E501
File "/Users/romilb/tools/anaconda3/lib/python3.9/contextlib.py", line 79, in inner
return func(*args, **kwds)
File "/Users/romilb/tools/anaconda3/lib/python3.9/site-packages/kubernetes/client/api/core_v1_api.py", line 8331, in create_namespaced_secret_with_http_info
return self.api_client.call_api(
File "/Users/romilb/tools/anaconda3/lib/python3.9/site-packages/kubernetes/client/api_client.py", line 348, in call_api
return self.__call_api(resource_path, method,
File "/Users/romilb/tools/anaconda3/lib/python3.9/site-packages/kubernetes/client/api_client.py", line 180, in __call_api
response_data = self.request(
File "/Users/romilb/tools/anaconda3/lib/python3.9/site-packages/kubernetes/client/api_client.py", line 391, in request
return self.rest_client.POST(url,
File "/Users/romilb/tools/anaconda3/lib/python3.9/site-packages/kubernetes/client/rest.py", line 279, in POST
return self.request("POST", url,
File "/Users/romilb/tools/anaconda3/lib/python3.9/site-packages/kubernetes/client/rest.py", line 238, in request
raise ApiException(http_resp=r)
kubernetes.client.exceptions.ApiException: (409)
Reason: Conflict
HTTP response headers: HTTPHeaderDict({'Audit-Id': '4b5b374d-fa7d-49e9-b0f8-81d92d99e889', 'Cache-Control': 'no-cache, private', 'Content-Type': 'application/json', 'X-Kubernetes-Pf-Flowschema-Uid': '586a62d2-264a-45a8-83d9-3681869bba6d', 'X-Kubernetes-Pf-Prioritylevel-Uid': 'abec74e0-9f48-41cc-8f90-7383b1f3eb31', 'Date': 'Mon, 16 Dec 2024 23:00:45 GMT', 'Content-Length': '208'})
HTTP response body: {"kind":"Status","apiVersion":"v1","metadata":{},"status":"Failure","message":"secrets \"sky-ssh-keys\" already exists","reason":"AlreadyExists","details":{"name":"sky-ssh-keys","kind":"secrets"},"code":409}
This is because of a race condition in authentication.py:
if kubernetes_utils.check_secret_exists(secret_name, namespace, context):
logger.debug(f'Key {secret_name} exists in the cluster, patching it...')
kubernetes.core_api(context).patch_namespaced_secret(
secret_name, namespace, secret)
else:
logger.debug(
f'Key {secret_name} does not exist in the cluster, creating it...')
kubernetes.core_api(context).create_namespaced_secret(namespace, secret)
We should probably continue (falling back to patching the secret) if a 409 with reason AlreadyExists is encountered.
- Add error handling for 409 Conflict error during concurrent secret creation
- Patch secret if it was created by another process between check and create
- Fixes #4472
Co-Authored-By: [email protected] <[email protected]>
(cherry picked from commit 1d1875c9d2baedede2934adea5736d59bc8666e5)
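Many concurrent `sky launch` invocations fail with the error shown above.
This is because of a race condition in authentication.py: another process can create the secret between the existence check and the create call.
We should probably continue if a 409 with reason `AlreadyExists` is encountered.
Repro: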
pytest tests/test_smoke.py::test_docker_storage_mounts --kubernetes
The text was updated successfully, but these errors were encountered: