Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add rollback detection #235

Draft
wants to merge 7 commits into
base: develop
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 45 additions & 33 deletions ecs_deploy/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,13 +155,15 @@ def deploy(cluster, service, tag, image, command, health_check, cpu, memory, mem
ignore_warnings=ignore_warnings,
sleep_time=sleep_time
)

if deployment.rollback:
slack.notify_failure(cluster, "Rollback", service=service)
exit(100)
except TaskPlacementError as e:
slack.notify_failure(cluster, str(e), service=service)
if rollback:
click.secho('%s\n' % str(e), fg='red', err=True)
rollback_task_definition(deployment, td, new_td, sleep_time=sleep_time)
exit(1)
exit(100)
else:
raise

Expand Down Expand Up @@ -532,21 +534,23 @@ def wait_for_finish(action, timeout, title, success_message, failure_message,
waiting = True

while waiting and datetime.now() < waiting_timeout:
click.secho('.', nl=False)
service = action.get_service()
inspected_until = inspect_errors(
inspected_until = inspect_events(
service=service,
failure_message=failure_message,
ignore_warnings=ignore_warnings,
since=inspected_until,
timeout=False
)

waiting = not action.is_deployed(service)
if action.primary_deployment_updated(service):
click.secho(f"{action.primary_deployment.status_message}", nl=True)

if waiting:
sleep(sleep_time)

inspect_errors(
inspect_events(
service=service,
failure_message=failure_message,
ignore_warnings=ignore_warnings,
Expand All @@ -556,7 +560,8 @@ def wait_for_finish(action, timeout, title, success_message, failure_message,

click.secho('\n%s' % success_message, fg='green')
click.secho('Duration: %s sec\n' % (datetime.now() - start_timestamp).seconds)

if action.rollback:
click.secho('Rollback complete', fg='green')

def deploy_task_definition(deployment, task_definition, title, success_message,
failure_message, timeout, deregister,
Expand Down Expand Up @@ -669,38 +674,29 @@ def print_diff(task_definition, title='Updating task definition'):
click.secho('')


def inspect_errors(service, failure_message, ignore_warnings, since, timeout):
def inspect_events(service, failure_message, ignore_warnings, since, timeout):
error = False
last_error_timestamp = since
warnings = service.get_warnings(since)
for timestamp in warnings:
message = warnings[timestamp]
click.secho('')
if ignore_warnings:
last_error_timestamp = timestamp
click.secho(
'%s\nWARNING: %s' % (timestamp, message),
fg='yellow',
err=False
)
last_event_timestamp = since
events = service.get_events(since)
for timestamp in events:
message = events[timestamp]
last_event_timestamp = timestamp
message_lower = message.lower()

if 'unable' in message_lower:
error = False if ignore_warnings else True
level = 'ERROR' if error else 'WARNING'
click.secho('Continuing.', nl=False)
event_log(timestamp, message, level)
elif 'rolling back' in message_lower:
event_log(timestamp, message, 'WARNING')
else:
click.secho(
'%s\nERROR: %s\n' % (timestamp, message),
fg='red',
err=True
)
error = True
event_log(timestamp, message, 'INFO')

if service.older_errors:
click.secho('')
click.secho('Older errors', fg='yellow', err=True)
event_log(timestamp, 'Older errors', 'WARNING')
for timestamp in service.older_errors:
click.secho(
text='%s\n%s\n' % (timestamp, service.older_errors[timestamp]),
fg='yellow',
err=True
)
event_log(timestamp, service.older_errors[timestamp], 'WARNING')

if timeout:
error = True
Expand All @@ -711,8 +707,24 @@ def inspect_errors(service, failure_message, ignore_warnings, since, timeout):
if error:
raise TaskPlacementError(failure_message)

return last_error_timestamp
return last_event_timestamp

def event_log(timestamp, message, level):
"""
Helper function to display a message with consistent formatting and color coding.
"""
color_map = {
'INFO': 'green',
'WARNING': 'yellow',
'ERROR': 'red'
}
color = color_map.get(level, 'white') # Default to white if the level is unknown

click.secho(
f'{timestamp} {level}: {message}',
fg=color,
err=(level == 'ERROR')
)

ecs.add_command(deploy)
ecs.add_command(scale)
Expand Down
48 changes: 42 additions & 6 deletions ecs_deploy/ecs.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,18 @@ class EcsDeployment(dict):
ROLLOUT_STATE_FAILED = u'FAILED'
ROLLOUT_STATE_COMPLETED = u'COMPLETED'

def __eq__(self, other):
return self.get(u'status') == other.get(u'status') and \
other.get('runningCount') == self.get('runningCount') and \
other.get('pendingCount') == self.get('pendingCount') and \
other.get('failedTasks') == self.get('failedTasks') and \
other.get('rolloutState') == self.get('rolloutState') and \
other.get('rolloutStateReason') == self.get('rolloutStateReason')

@property
def status_message(self):
return f"{self.get('rolloutState')} : {self.rollout_state_reason} Running: {self.get('runningCount')} / Pending: {self.get('pendingCount')} / Failed: {self.get('failedTasks')}"

@property
def is_primary(self):
return self.get(u'status') == self.STATUS_PRIMARY
Expand All @@ -217,6 +229,10 @@ def is_active(self):
def has_failed(self):
return self.get(u'rolloutState') == self.ROLLOUT_STATE_FAILED

@property
def is_rollback(self):
return "circuit breaker: rolling back" in self.get(u'rolloutStateReason')

@property
def has_completed(self):
return self.get(u'rolloutState') == self.ROLLOUT_STATE_COMPLETED
Expand Down Expand Up @@ -298,15 +314,17 @@ def older_errors(self):
)

def get_warnings(self, since=None, until=None):
events = self.get_events(since, until)
return {k: v for k, v in events.items() if 'unable' in v}

def get_events(self, since=None, until=None):
since = since or self.deployment_created_at
until = until or datetime.now(tz=tzlocal())
errors = {}
events = {}
for event in self.get(u'events'):
if u'unable' not in event[u'message']:
continue
if since < event[u'createdAt'] < until:
errors[event[u'createdAt']] = event[u'message']
return errors
events[event[u'createdAt']] = event[u'message']
return events


class EcsTaskDefinition(object):
Expand Down Expand Up @@ -1308,6 +1326,9 @@ def __init__(self, client, cluster_name, service_name):
try:
if service_name:
self._service = self.get_service()
self.primary_deployment = self._service.primary_deployment
self.active_deployment = self._service.active_deployment
self.rollback = False
except IndexError:
raise EcsConnectionError(
u'An error occurred when calling the DescribeServices '
Expand Down Expand Up @@ -1372,8 +1393,23 @@ def update_service(self, service, desired_count=None):
task_definition=service.task_definition
)
return EcsService(self._cluster_name, response[u'service'])

def primary_deployment_updated(self, service):
if service.primary_deployment != self.primary_deployment:
self.primary_deployment = service.primary_deployment
return True

def active_deployment_updated(self, service):
if service.active_deployment != self.active_deployment:
self.active_deployment = service.active_deployment
return True

def deployment_status_updated(self, service):
return self.primary_deployment_updated(service) or self.active_deployment_updated(service)

def is_deployed(self, service):
if service.primary_deployment.is_rollback:
self.rollback = True
if service.primary_deployment.has_failed:
raise EcsDeploymentError(u'Deployment Failed! ' + service.primary_deployment.rollout_state_reason)
if service.primary_deployment.failed_tasks > 0 and service.primary_deployment.failed_tasks != self.FAILED_TASKS:
Expand All @@ -1392,7 +1428,7 @@ def is_deployed(self, service):
service=service,
task_arns=running_tasks[u'taskArns']
)
return service.desired_count == running_count
return service.desired_count == running_count and service.primary_deployment.has_completed

def get_running_tasks_count(self, service, task_arns):
running_count = 0
Expand Down
3 changes: 1 addition & 2 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def test_deploy_with_rollback(get_client, runner):
get_client.return_value = EcsTestClient('acces_key', 'secret_key', wait=2)
result = runner.invoke(cli.deploy, (CLUSTER_NAME, SERVICE_NAME, '--timeout=1', '--rollback'))

assert result.exit_code == 1
assert result.exit_code == 100
assert result.exception
assert u"Deploying based on task definition: test-task:1" in result.output

Expand Down Expand Up @@ -781,7 +781,6 @@ def test_deploy_with_wait_within_timeout(get_client, runner):
result = runner.invoke(cli.deploy, (CLUSTER_NAME, SERVICE_NAME, '--timeout', '10'))
assert result.exit_code == 0
assert u'Deploying new task definition' in result.output
assert u'...' in result.output


@patch('ecs_deploy.cli.get_client')
Expand Down