From 2a0929b584aaf296fd574cbeed16334fdc09b4ca Mon Sep 17 00:00:00 2001 From: Aylei Date: Fri, 20 Dec 2024 18:07:05 +0800 Subject: [PATCH 1/7] [Tests] fix smoke test race condition on first run Signed-off-by: Aylei --- sky/provision/aws/config.py | 38 +++++++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/sky/provision/aws/config.py b/sky/provision/aws/config.py index 6a8c77eafed..04f55b93701 100644 --- a/sky/provision/aws/config.py +++ b/sky/provision/aws/config.py @@ -150,6 +150,36 @@ def _get_role(role_name: str): f'{role_name}{colorama.Style.RESET_ALL} from AWS.') raise exc + def _create_instance_profile_if_not_exists(instance_profile_name: str): + try: + iam.meta.client.create_instance_profile( + InstanceProfileName=instance_profile_name) + except aws.botocore_exceptions().ClientError as exc: + if exc.response.get('Error', + {}).get('Code') == 'EntityAlreadyExists': + return + else: + utils.handle_boto_error( + exc, + f'Failed to create instance profile {colorama.Style.BRIGHT}' + f'{instance_profile_name}{colorama.Style.RESET_ALL} in AWS.' + ) + raise exc + + def _create_role_if_not_exists(role_name: str, policy_doc: Dict[str, Any]): + try: + iam.create_role(RoleName=role_name, + AssumeRolePolicyDocument=json.dumps(policy_doc)) + except aws.botocore_exceptions().ClientError as exc: + if exc.response.get('Error', + {}).get('Code') == 'EntityAlreadyExists': + return + else: + utils.handle_boto_error( + exc, f'Failed to create role {colorama.Style.BRIGHT}' + f'{role_name}{colorama.Style.RESET_ALL} in AWS.') + raise exc + instance_profile_name = DEFAULT_SKYPILOT_INSTANCE_PROFILE profile = _get_instance_profile(instance_profile_name) @@ -158,8 +188,7 @@ def _get_role(role_name: str): f'Creating new IAM instance profile {colorama.Style.BRIGHT}' f'{instance_profile_name}{colorama.Style.RESET_ALL} for use as the ' 'default.') - iam.meta.client.create_instance_profile( - InstanceProfileName=instance_profile_name) + _create_instance_profile_if_not_exists(instance_profile_name) profile = _get_instance_profile(instance_profile_name) time.sleep(15) # wait for propagation assert profile is not None, 'Failed to create instance profile' @@ -186,8 +215,9 @@ def _get_role(role_name: str): 'arn:aws:iam::aws:policy/AmazonS3FullAccess', ] - iam.create_role(RoleName=role_name, - AssumeRolePolicyDocument=json.dumps(policy_doc)) + # TODO(aylei): need to check and reconcile the role permission + # in case of external modification. + _create_role_if_not_exists(role_name, policy_doc) role = _get_role(role_name) assert role is not None, 'Failed to create role' From ba0fe3a12eeb363b74c55cd296913d2db15d360f Mon Sep 17 00:00:00 2001 From: Aylei Date: Fri, 27 Dec 2024 19:11:50 +0800 Subject: [PATCH 2/7] fix add role race condition Signed-off-by: Aylei --- sky/provision/aws/config.py | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/sky/provision/aws/config.py b/sky/provision/aws/config.py index 04f55b93701..15c4ae06613 100644 --- a/sky/provision/aws/config.py +++ b/sky/provision/aws/config.py @@ -180,6 +180,35 @@ def _create_role_if_not_exists(role_name: str, policy_doc: Dict[str, Any]): f'{role_name}{colorama.Style.RESET_ALL} in AWS.') raise exc + def _ensure_instance_profile_role(profile: Any, role: Any): + try: + profile.add_role(RoleName=role.name) + except aws.botocore_exceptions().ClientError as exc: + # AddRoleToInstanceProfile is not idempotent. Adding a role to an + # InstanceProfile that already has an associated role will cause + # LimitExceeded error, even if the two roles are identical. + # see also: https://docs.aws.amazon.com/IAM/latest/APIReference/API_AddRoleToInstanceProfile.html # pylint: disable=line-too-long + if exc.response.get('Error', {}).get('Code') == 'LimitExceeded': + # If the associated role is not the role we expect, error out + # to user instead of silently overriding the role. + if profile.roles and profile.roles[0].name != role.name: + utils.handle_boto_error( + exc, f'The instance profile {profile.name} already has ' + f'an associated role {profile.roles[0].name}, but the ' + f'role {role.name} is not the same as the expected ' + f'role. Please remove the existing role from the ' + f'instance profile and try again.') + raise exc + # If the associated role is the role we expect, do nothing. + return + else: + utils.handle_boto_error( + exc, f'Failed to add role {colorama.Style.BRIGHT}' + f'{role.name}{colorama.Style.RESET_ALL} to instance ' + f'profile {colorama.Style.BRIGHT}{profile.name}' + f'{colorama.Style.RESET_ALL} in AWS.') + raise exc + instance_profile_name = DEFAULT_SKYPILOT_INSTANCE_PROFILE profile = _get_instance_profile(instance_profile_name) @@ -193,6 +222,7 @@ def _create_role_if_not_exists(role_name: str, policy_doc: Dict[str, Any]): time.sleep(15) # wait for propagation assert profile is not None, 'Failed to create instance profile' + # TODO(aylei): check if the role associated is the one we expect. if not profile.roles: role_name = DEFAULT_SKYPILOT_IAM_ROLE role = _get_role(role_name) @@ -247,7 +277,7 @@ def _create_role_if_not_exists(role_name: str, policy_doc: Dict[str, Any]): role.Policy('SkyPilotPassRolePolicy').put( PolicyDocument=json.dumps(skypilot_pass_role_policy_doc)) - profile.add_role(RoleName=role.name) + _ensure_instance_profile_role(profile, role) time.sleep(15) # wait for propagation return {'Arn': profile.arn} From d6faacd792e55f33acc49ce03c28aab7f9539433 Mon Sep 17 00:00:00 2001 From: Aylei Date: Fri, 27 Dec 2024 23:43:33 +0800 Subject: [PATCH 3/7] address review comments Signed-off-by: Aylei --- sky/provision/aws/config.py | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/sky/provision/aws/config.py b/sky/provision/aws/config.py index 15c4ae06613..2a96e77a534 100644 --- a/sky/provision/aws/config.py +++ b/sky/provision/aws/config.py @@ -180,6 +180,17 @@ def _create_role_if_not_exists(role_name: str, policy_doc: Dict[str, Any]): f'{role_name}{colorama.Style.RESET_ALL} in AWS.') raise exc + def _check_instance_profile_role(profile: Any, role_name: str): + if profile.roles and profile.roles[0].name != role_name: + # If the associated role is not the role we expect, error out + # to user instead of silently overriding the role. + logger.fatal(f'The instance profile {profile.name} already has ' + f'an associated role {profile.roles[0].name}, but the ' + f'role {role.name} is not the same as the expected ' + f'role. Please remove the existing role from the ' + f'instance profile and try again.') + raise SystemExit(1) + def _ensure_instance_profile_role(profile: Any, role: Any): try: profile.add_role(RoleName=role.name) @@ -189,17 +200,7 @@ def _ensure_instance_profile_role(profile: Any, role: Any): # LimitExceeded error, even if the two roles are identical. # see also: https://docs.aws.amazon.com/IAM/latest/APIReference/API_AddRoleToInstanceProfile.html # pylint: disable=line-too-long if exc.response.get('Error', {}).get('Code') == 'LimitExceeded': - # If the associated role is not the role we expect, error out - # to user instead of silently overriding the role. - if profile.roles and profile.roles[0].name != role.name: - utils.handle_boto_error( - exc, f'The instance profile {profile.name} already has ' - f'an associated role {profile.roles[0].name}, but the ' - f'role {role.name} is not the same as the expected ' - f'role. Please remove the existing role from the ' - f'instance profile and try again.') - raise exc - # If the associated role is the role we expect, do nothing. + _check_instance_profile_role(profile, role) return else: utils.handle_boto_error( @@ -222,9 +223,11 @@ def _ensure_instance_profile_role(profile: Any, role: Any): time.sleep(15) # wait for propagation assert profile is not None, 'Failed to create instance profile' - # TODO(aylei): check if the role associated is the one we expect. - if not profile.roles: - role_name = DEFAULT_SKYPILOT_IAM_ROLE + role_name = DEFAULT_SKYPILOT_IAM_ROLE + if profile.roles: + _check_instance_profile_role(profile, role_name) + else: + # there is no role associated, ensure the role and associate role = _get_role(role_name) if role is None: logger.info( From 096e59a86b16fdf4f281bba45a1b943239882d24 Mon Sep 17 00:00:00 2001 From: Aylei Date: Mon, 30 Dec 2024 09:57:18 +0800 Subject: [PATCH 4/7] address review comments Signed-off-by: Aylei --- sky/provision/aws/config.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sky/provision/aws/config.py b/sky/provision/aws/config.py index 2a96e77a534..3f4ae62349f 100644 --- a/sky/provision/aws/config.py +++ b/sky/provision/aws/config.py @@ -182,11 +182,9 @@ def _create_role_if_not_exists(role_name: str, policy_doc: Dict[str, Any]): def _check_instance_profile_role(profile: Any, role_name: str): if profile.roles and profile.roles[0].name != role_name: - # If the associated role is not the role we expect, error out - # to user instead of silently overriding the role. logger.fatal(f'The instance profile {profile.name} already has ' f'an associated role {profile.roles[0].name}, but the ' - f'role {role.name} is not the same as the expected ' + f'role {role_name} is not the same as the expected ' f'role. Please remove the existing role from the ' f'instance profile and try again.') raise SystemExit(1) From 344a5dba6daa9e3c7909cc9c6795553fa79dac66 Mon Sep 17 00:00:00 2001 From: Aylei Date: Thu, 2 Jan 2025 20:24:47 +0800 Subject: [PATCH 5/7] Update sky/provision/aws/config.py Co-authored-by: Zhanghao Wu --- sky/provision/aws/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/provision/aws/config.py b/sky/provision/aws/config.py index 3f4ae62349f..b7c50a1558f 100644 --- a/sky/provision/aws/config.py +++ b/sky/provision/aws/config.py @@ -150,7 +150,7 @@ def _get_role(role_name: str): f'{role_name}{colorama.Style.RESET_ALL} from AWS.') raise exc - def _create_instance_profile_if_not_exists(instance_profile_name: str): + def _create_instance_profile_if_not_exists(instance_profile_name: str) -> None: try: iam.meta.client.create_instance_profile( InstanceProfileName=instance_profile_name) From 77065be2ef993892628a6cbb055cb55736542e74 Mon Sep 17 00:00:00 2001 From: Aylei Date: Thu, 2 Jan 2025 21:15:08 +0800 Subject: [PATCH 6/7] address review comments Signed-off-by: Aylei --- sky/provision/aws/config.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sky/provision/aws/config.py b/sky/provision/aws/config.py index b7c50a1558f..4c5edac1096 100644 --- a/sky/provision/aws/config.py +++ b/sky/provision/aws/config.py @@ -150,7 +150,8 @@ def _get_role(role_name: str): f'{role_name}{colorama.Style.RESET_ALL} from AWS.') raise exc - def _create_instance_profile_if_not_exists(instance_profile_name: str) -> None: + def _create_instance_profile_if_not_exists(instance_profile_name: str + ) -> None: try: iam.meta.client.create_instance_profile( InstanceProfileName=instance_profile_name) @@ -166,7 +167,8 @@ def _create_instance_profile_if_not_exists(instance_profile_name: str) -> None: ) raise exc - def _create_role_if_not_exists(role_name: str, policy_doc: Dict[str, Any]): + def _create_role_if_not_exists(role_name: str, policy_doc: Dict[str, Any] + ) -> None: try: iam.create_role(RoleName=role_name, AssumeRolePolicyDocument=json.dumps(policy_doc)) @@ -180,7 +182,7 @@ def _create_role_if_not_exists(role_name: str, policy_doc: Dict[str, Any]): f'{role_name}{colorama.Style.RESET_ALL} in AWS.') raise exc - def _check_instance_profile_role(profile: Any, role_name: str): + def _check_instance_profile_role(profile: Any, role_name: str) -> None: if profile.roles and profile.roles[0].name != role_name: logger.fatal(f'The instance profile {profile.name} already has ' f'an associated role {profile.roles[0].name}, but the ' From 4635d841aea6015b681396943780da5c6dae40ee Mon Sep 17 00:00:00 2001 From: Aylei Date: Thu, 2 Jan 2025 22:18:58 +0800 Subject: [PATCH 7/7] format Signed-off-by: Aylei --- sky/provision/aws/config.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sky/provision/aws/config.py b/sky/provision/aws/config.py index 4c5edac1096..903c854feae 100644 --- a/sky/provision/aws/config.py +++ b/sky/provision/aws/config.py @@ -150,8 +150,8 @@ def _get_role(role_name: str): f'{role_name}{colorama.Style.RESET_ALL} from AWS.') raise exc - def _create_instance_profile_if_not_exists(instance_profile_name: str - ) -> None: + def _create_instance_profile_if_not_exists( + instance_profile_name: str) -> None: try: iam.meta.client.create_instance_profile( InstanceProfileName=instance_profile_name) @@ -167,8 +167,8 @@ def _create_instance_profile_if_not_exists(instance_profile_name: str ) raise exc - def _create_role_if_not_exists(role_name: str, policy_doc: Dict[str, Any] - ) -> None: + def _create_role_if_not_exists(role_name: str, + policy_doc: Dict[str, Any]) -> None: try: iam.create_role(RoleName=role_name, AssumeRolePolicyDocument=json.dumps(policy_doc))