Optional retry with more memory for all standard backends.
kshakir committed May 11, 2024
1 parent 61063cb commit 0477642
Showing 12 changed files with 134 additions and 35 deletions.
backend/src/main/scala/cromwell/backend/standard/StandardAsyncExecutionActor.scala
@@ -237,13 +237,16 @@ trait StandardAsyncExecutionActor
  lazy val commandDirectory: Path = jobPaths.callExecutionRoot

  lazy val memoryRetryErrorKeys: Option[List[String]] =
-    configurationDescriptor.globalConfig.as[Option[List[String]]]("system.memory-retry-error-keys")
+    configurationDescriptor.globalConfig.getAs[List[String]]("system.memory-retry-error-keys")

+  lazy val memoryRetryStderrLimit: Option[Int] =
+    configurationDescriptor.globalConfig.getAs[Int]("system.memory-retry-stderr-limit")

  lazy val memoryRetryFactor: Option[MemoryRetryMultiplierRefined] =
    jobDescriptor.workflowDescriptor.getWorkflowOption(WorkflowOptions.MemoryRetryMultiplier) flatMap { value: String =>
      Try(value.toDouble) match {
        case Success(v) =>
-          refineV[MemoryRetryMultiplier](v.toDouble) match {
+          refineV[MemoryRetryMultiplier](v) match {
            case Left(e) =>
              // should not happen, this case should have been screened for and fast-failed during workflow materialization.
              log.error(
@@ -1154,7 +1157,7 @@ trait StandardAsyncExecutionActor
    val nextKvJobKey =
      KvJobKey(jobDescriptor.key.call.fullyQualifiedName, jobDescriptor.key.index, jobDescriptor.key.attempt + 1)

-    def getNextKvPair[A](key: String, value: String): Map[String, KvPair] = {
+    def getNextKvPair(key: String, value: String): Map[String, KvPair] = {
      val nextScopedKey = ScopedKey(jobDescriptor.workflowDescriptor.id, nextKvJobKey, key)
      val nextKvPair = KvPair(nextScopedKey, value)
      Map(key -> nextKvPair)
@@ -1359,36 +1362,44 @@ trait StandardAsyncExecutionActor

    // Returns true if the task has written an RC file that indicates OOM, false otherwise
    def memoryRetryRC: Future[Boolean] = {
-      def returnCodeAsBoolean(codeAsOption: Option[String]): Boolean =
-        codeAsOption match {
-          case Some(codeAsString) =>
-            Try(codeAsString.trim.toInt) match {
-              case Success(code) =>
-                code match {
-                  case StderrContainsRetryKeysCode => true
-                  case _ => false
-                }
-              case Failure(e) =>
-                log.error(
-                  s"'CheckingForMemoryRetry' action exited with code '$codeAsString' which couldn't be " +
-                    s"converted to an Integer. Task will not be retried with more memory. Error: ${ExceptionUtils.getMessage(e)}"
-                )
-                false
-            }
-          case None => false

+      def readFile(path: Path, maxBytes: Option[Int]): Future[String] =
+        asyncIo.contentAsStringAsync(path, maxBytes, failOnOverflow = false)

+      def checkMemoryRetryRC(): Future[Boolean] =
+        readFile(jobPaths.memoryRetryRC, None) map { codeAsString =>
+          Try(codeAsString.trim.toInt) match {
+            case Success(code) =>
+              code match {
+                case StderrContainsRetryKeysCode => true
+                case _ => false
+              }
+            case Failure(e) =>
+              log.error(
+                s"'CheckingForMemoryRetry' action exited with code '$codeAsString' which couldn't be " +
+                  s"converted to an Integer. Task will not be retried with more memory. Error: ${ExceptionUtils.getMessage(e)}"
+              )
+              false
+          }
        }

-      def readMemoryRetryRCFile(fileExists: Boolean): Future[Option[String]] =
-        if (fileExists)
-          asyncIo.contentAsStringAsync(jobPaths.memoryRetryRC, None, failOnOverflow = false).map(Option(_))
-        else
-          Future.successful(None)
+      def checkMemoryRetryStderr(errorKeys: List[String], maxBytes: Int): Future[Boolean] =
+        readFile(jobPaths.standardPaths.error, Option(maxBytes)) map { errorContent =>
+          errorKeys.exists(errorContent.contains)
+        }

-      for {
-        fileExists <- asyncIo.existsAsync(jobPaths.memoryRetryRC)
-        retryCheckRCAsOption <- readMemoryRetryRCFile(fileExists)
-        retryWithMoreMemory = returnCodeAsBoolean(retryCheckRCAsOption)
-      } yield retryWithMoreMemory
+      asyncIo.existsAsync(jobPaths.memoryRetryRC) flatMap {
+        case true => checkMemoryRetryRC()
+        case false =>
+          (memoryRetryErrorKeys, memoryRetryStderrLimit) match {
+            case (Some(keys), Some(limit)) =>
+              asyncIo.existsAsync(jobPaths.standardPaths.error) flatMap {
+                case true => checkMemoryRetryStderr(keys, limit)
+                case false => Future.successful(false)
+              }
+            case _ => Future.successful(false)
+          }
+      }
    }

val stderr = jobPaths.standardPaths.error
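Read on its own, the new version keeps the RC-file check as the primary signal and adds a stderr scan as a fallback that only runs when both `system.memory-retry-error-keys` and `system.memory-retry-stderr-limit` are configured. Below is a minimal, self-contained sketch of that decision order; the object and method names and the StderrSentinel value are made up for the sketch, and the file contents are passed in as strings instead of being read through asyncIo and jobPaths as the actor does.

import scala.util.{Success, Try}

// Hedged sketch of the fallback order in memoryRetryRC, outside the actor.
// StderrSentinel stands in for StderrContainsRetryKeysCode; its value here is illustrative.
object MemoryRetrySketch {
  private val StderrSentinel = 32

  def shouldRetryWithMoreMemory(rcFileContent: Option[String], // memory-retry RC file contents, if the file exists
                                stderrContent: Option[String], // stderr contents, if the file exists
                                errorKeys: Option[List[String]],
                                stderrLimit: Option[Int]): Boolean =
    rcFileContent match {
      // 1) Prefer the RC file written by the backend: retry only on the sentinel exit code.
      case Some(code) => Try(code.trim.toInt) == Success(StderrSentinel)
      // 2) Otherwise scan a bounded prefix of stderr for any configured key, if both settings are present.
      //    (The real code bounds the read itself via maxBytes; take(limit) approximates that here.)
      case None =>
        (errorKeys, stderrLimit, stderrContent) match {
          case (Some(keys), Some(limit), Some(stderr)) => keys.exists(stderr.take(limit).contains)
          case _ => false
        }
    }
}

For instance, shouldRetryWithMoreMemory(None, Some("... Killed ..."), Some(List("OutOfMemory", "Killed")), Some(128000)) yields true, which is the situation the retry_with_more_memory_tes test further down exercises on a backend that has no RC file to consult.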
@@ -0,0 +1,15 @@
name: do_not_retry_rc0_tes
testFormat: workflowsuccess
backends: [TES]

files {
workflow: retry_with_more_memory/do_not_retry_rc0.wdl
options: retry_with_more_memory/retry_with_more_memory.options
}

metadata {
workflowName: do_not_retry_rc0
status: Succeeded
"calls.do_not_retry_rc0.imitate_oom_error.executionStatus": "Done"
"calls.do_not_retry_rc0.imitate_oom_error.runtimeAttributes.memory": "1 GB"
}
@@ -0,0 +1,15 @@
name: do_not_retry_rc1_tes
testFormat: workflowsuccess
backends: [TES]

files {
workflow: retry_with_more_memory/do_not_retry_rc1.wdl
options: retry_with_more_memory/retry_with_more_memory.options
}

metadata {
workflowName: do_not_retry_rc1
status: Succeeded
"calls.do_not_retry_rc1.imitate_oom_error.executionStatus": "Done"
"calls.do_not_retry_rc1.imitate_oom_error.runtimeAttributes.memory": "1 GB"
}
@@ -0,0 +1,21 @@
name: retry_same_memory_output_failure_tes
testFormat: workflowfailure
backends: [TES]

files {
workflow: retry_with_more_memory/retry_same_memory_output_failure.wdl
options: retry_with_more_memory/retry_with_more_memory.options
}

metadata {
workflowName: retry_same_memory_output_failure
status: Failed
"failures.0.message": "Workflow failed"
"failures.0.causedBy.0.message": ~~"Task retry_same_memory_output_failure.imitate_oom_error:NA:3 failed with SYSTEM_ERROR"
"retry_same_memory_output_failure.imitate_oom_error.-1.1.executionStatus": "RetryableFailure"
"retry_same_memory_output_failure.imitate_oom_error.-1.1.runtimeAttributes.memory": "1 GB"
"retry_same_memory_output_failure.imitate_oom_error.-1.2.executionStatus": "RetryableFailure"
"retry_same_memory_output_failure.imitate_oom_error.-1.2.runtimeAttributes.memory": "1 GB"
"retry_same_memory_output_failure.imitate_oom_error.-1.3.executionStatus": "Failed"
"retry_same_memory_output_failure.imitate_oom_error.-1.3.runtimeAttributes.memory": "1 GB"
}
@@ -12,7 +12,6 @@ task imitate_oom_error {
    docker: "python:latest"
    memory: "1 GB"
    maxRetries: 2
-    backend: "Papiv2"
  }
}

@@ -13,7 +13,6 @@ task imitate_oom_error {
    memory: "1 GB"
    continueOnReturnCode: true
    maxRetries: 2
-    backend: "Papiv2"
  }
}

@@ -18,7 +18,6 @@ task imitate_oom_error {
    memory: "1 GB"
    continueOnReturnCode: true
    maxRetries: 2
-    backend: "Papiv2"
  }
}

@@ -13,7 +13,6 @@ task imitate_oom_error {
    docker: "python:latest"
    memory: "1 GB"
    maxRetries: 2
-    backend: "Papiv2"
  }
}

@@ -0,0 +1,19 @@
name: retry_with_more_memory_no_wf_option_tes
testFormat: workflowfailure
backends: [TES]

files {
workflow: retry_with_more_memory/retry_with_more_memory.wdl
}

metadata {
workflowName: retry_with_more_memory
status: Failed
"failures.0.message": "Workflow failed"
"retry_with_more_memory.imitate_oom_error.-1.1.executionStatus": "RetryableFailure"
"retry_with_more_memory.imitate_oom_error.-1.1.runtimeAttributes.memory": "1 GB"
"retry_with_more_memory.imitate_oom_error.-1.2.executionStatus": "RetryableFailure"
"retry_with_more_memory.imitate_oom_error.-1.2.runtimeAttributes.memory": "1 GB"
"retry_with_more_memory.imitate_oom_error.-1.3.executionStatus": "Failed"
"retry_with_more_memory.imitate_oom_error.-1.3.runtimeAttributes.memory": "1 GB"
}
@@ -0,0 +1,21 @@
name: retry_with_more_memory_tes
testFormat: workflowfailure
backends: [TES]

files {
workflow: retry_with_more_memory/retry_with_more_memory.wdl
options: retry_with_more_memory/retry_with_more_memory.options
}

metadata {
workflowName: retry_with_more_memory
status: Failed
"failures.0.message": "Workflow failed"
"failures.0.causedBy.0.message": "stderr for job `retry_with_more_memory.imitate_oom_error:NA:3` contained one of the `memory-retry-error-keys: [OutOfMemory,Killed]` specified in the Cromwell config. Job might have run out of memory."
"retry_with_more_memory.imitate_oom_error.-1.1.executionStatus": "RetryableFailure"
"retry_with_more_memory.imitate_oom_error.-1.1.runtimeAttributes.memory": "1 GB"
"retry_with_more_memory.imitate_oom_error.-1.2.executionStatus": "RetryableFailure"
"retry_with_more_memory.imitate_oom_error.-1.2.runtimeAttributes.memory": "1.1 GB"
"retry_with_more_memory.imitate_oom_error.-1.3.executionStatus": "Failed"
"retry_with_more_memory.imitate_oom_error.-1.3.runtimeAttributes.memory": "1.2100000000000002 GB"
}
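The memory values above follow directly from the multiplier applied on each retryable failure (the shared retry_with_more_memory.options file presumably sets memory_retry_multiplier to 1.1, consistent with these numbers): attempt 2 requests 1 GB × 1.1 = 1.1 GB, attempt 3 requests 1.1 GB × 1.1 = 1.21 GB, and the recorded 1.2100000000000002 GB is simply binary floating-point rounding of 1.21.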
1 change: 1 addition & 0 deletions core/src/main/resources/reference.conf
@@ -280,6 +280,7 @@ system {
  dns-cache-ttl: 3 minutes

  memory-retry-error-keys = ["OutOfMemory", "Killed"]
+  memory-retry-stderr-limit = 128000
}

workflow-options {
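Because these defaults live under system in reference.conf, a deployment can override them in its own configuration file. A hedged sketch of such an override (the key names are the ones added in this commit; the limit value is purely illustrative):

# Illustrative cromwell.conf override; values are examples, not recommendations
system {
  memory-retry-error-keys = ["OutOfMemory", "Killed"]
  memory-retry-stderr-limit = 256000
}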
2 changes: 1 addition & 1 deletion docs/cromwell_features/RetryWithMoreMemory.md
@@ -34,7 +34,7 @@ string present in `system.memory-retry-error-keys`. Similarly, if the runtime at
specified as a true, or the return code of the task matches a value specified by `continueOnReturnCode`, the task
will be considered successful and will not be retried with more memory.

-Please note that this feature currently only works in Google Cloud backend. Also, Pipelines API might adjust the
+Please note that backends such as the Pipelines API might adjust the
memory value based on their standards for memory for a VM. So it's possible that even though the request says 1.1 GB
memory, it actually allocated a bit more memory to the VM.
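For context, the behavior described in this doc is opted into per workflow via the memory_retry_multiplier workflow option. The retry_with_more_memory.options file used by the tests in this commit is not shown in the diff; a hedged example of what such an options file might contain, with an illustrative multiplier of 1.1:

{
  "memory_retry_multiplier": 1.1
}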

