Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Tests] fix smoke test race condition on first run #4494

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 65 additions & 5 deletions sky/provision/aws/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,65 @@ def _get_role(role_name: str):
f'{role_name}{colorama.Style.RESET_ALL} from AWS.')
raise exc

def _create_instance_profile_if_not_exists(instance_profile_name: str):
aylei marked this conversation as resolved.
Show resolved Hide resolved
try:
iam.meta.client.create_instance_profile(
InstanceProfileName=instance_profile_name)
except aws.botocore_exceptions().ClientError as exc:
if exc.response.get('Error',
{}).get('Code') == 'EntityAlreadyExists':
return
else:
utils.handle_boto_error(
exc,
f'Failed to create instance profile {colorama.Style.BRIGHT}'
f'{instance_profile_name}{colorama.Style.RESET_ALL} in AWS.'
)
raise exc

def _create_role_if_not_exists(role_name: str, policy_doc: Dict[str, Any]):
try:
iam.create_role(RoleName=role_name,
AssumeRolePolicyDocument=json.dumps(policy_doc))
except aws.botocore_exceptions().ClientError as exc:
if exc.response.get('Error',
{}).get('Code') == 'EntityAlreadyExists':
return
else:
utils.handle_boto_error(
exc, f'Failed to create role {colorama.Style.BRIGHT}'
f'{role_name}{colorama.Style.RESET_ALL} in AWS.')
raise exc

def _ensure_instance_profile_role(profile: Any, role: Any):
try:
profile.add_role(RoleName=role.name)
except aws.botocore_exceptions().ClientError as exc:
# AddRoleToInstanceProfile is not idempotent. Adding a role to an
# InstanceProfile that already has an associated role will cause
# LimitExceeded error, even if the two roles are identical.
# see also: https://docs.aws.amazon.com/IAM/latest/APIReference/API_AddRoleToInstanceProfile.html # pylint: disable=line-too-long
if exc.response.get('Error', {}).get('Code') == 'LimitExceeded':
# If the associated role is not the role we expect, error out
# to user instead of silently overriding the role.
if profile.roles and profile.roles[0].name != role.name:
utils.handle_boto_error(
exc, f'The instance profile {profile.name} already has '
f'an associated role {profile.roles[0].name}, but the '
f'role {role.name} is not the same as the expected '
f'role. Please remove the existing role from the '
f'instance profile and try again.')
raise exc
# If the associated role is the role we expect, do nothing.
return
else:
utils.handle_boto_error(
exc, f'Failed to add role {colorama.Style.BRIGHT}'
f'{role.name}{colorama.Style.RESET_ALL} to instance '
f'profile {colorama.Style.BRIGHT}{profile.name}'
f'{colorama.Style.RESET_ALL} in AWS.')
raise exc

instance_profile_name = DEFAULT_SKYPILOT_INSTANCE_PROFILE
profile = _get_instance_profile(instance_profile_name)

Expand All @@ -158,12 +217,12 @@ def _get_role(role_name: str):
f'Creating new IAM instance profile {colorama.Style.BRIGHT}'
f'{instance_profile_name}{colorama.Style.RESET_ALL} for use as the '
'default.')
iam.meta.client.create_instance_profile(
InstanceProfileName=instance_profile_name)
_create_instance_profile_if_not_exists(instance_profile_name)
profile = _get_instance_profile(instance_profile_name)
time.sleep(15) # wait for propagation
assert profile is not None, 'Failed to create instance profile'

# TODO(aylei): check if the role associated is the one we expect.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You already checked in _ensure_instance_profile_role. Can we remove the TODO?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The check in _ensure_instance_profile_role only occurs when race happens, i.e. profile.roles is empty and the subsequent adding role action encounters LimitExceeded error. We still need to check the role in other cases.

I found this TODO is trivial, just added the missing part and removed the TODO.

if not profile.roles:
role_name = DEFAULT_SKYPILOT_IAM_ROLE
role = _get_role(role_name)
Expand All @@ -186,8 +245,9 @@ def _get_role(role_name: str):
'arn:aws:iam::aws:policy/AmazonS3FullAccess',
]

iam.create_role(RoleName=role_name,
AssumeRolePolicyDocument=json.dumps(policy_doc))
# TODO(aylei): need to check and reconcile the role permission
# in case of external modification.
_create_role_if_not_exists(role_name, policy_doc)
role = _get_role(role_name)
assert role is not None, 'Failed to create role'

Expand Down Expand Up @@ -217,7 +277,7 @@ def _get_role(role_name: str):
role.Policy('SkyPilotPassRolePolicy').put(
PolicyDocument=json.dumps(skypilot_pass_role_policy_doc))

profile.add_role(RoleName=role.name)
_ensure_instance_profile_role(profile, role)
time.sleep(15) # wait for propagation
return {'Arn': profile.arn}

Expand Down
Loading