diff --git a/CHANGELOG.md b/CHANGELOG.md index d9e135d792..46b1444d9d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -59,9 +59,9 @@ The index had low cardinality and workflow pickup is faster without it. Migratio The `IX_METADATA_ENTRY_WEU_MK` index is added to `METADATA_ENTRY`. In pre-release testing, the migration proceeded at about 3 million rows per minute. Please plan downtime accordingly. -### Bug fixes and small changes +### Reduce errors from boot disk filling up on Google Lifesciences API - * Changed default boot disk size from 10GB to 20GB in PipelinesAPI and Google Batch backends + * If Cromwell can't determine the size of the user command Docker image, it will increase the Lifesciences API boot disk size by 30GB rather than 0GB. This should reduce the incidence of tasks failing due to the boot disk filling up. #### Improved `size()` function performance on arrays diff --git a/centaur/src/main/resources/standardTestCases/docker_size_dockerhub.test b/centaur/src/main/resources/standardTestCases/docker_size_dockerhub.test index f7d59e4b39..dc3d297f6f 100644 --- a/centaur/src/main/resources/standardTestCases/docker_size_dockerhub.test +++ b/centaur/src/main/resources/standardTestCases/docker_size_dockerhub.test @@ -11,8 +11,8 @@ files { metadata { status: Succeeded - "outputs.docker_size_dockerhub.large_dockerhub_image_with_hash.bootDiskSize": 27 - "outputs.docker_size_dockerhub.large_dockerhub_image_with_tag.bootDiskSize": 27 + "outputs.docker_size_dockerhub.large_dockerhub_image_with_hash.bootDiskSize": 17 + "outputs.docker_size_dockerhub.large_dockerhub_image_with_tag.bootDiskSize": 17 } workflowType: WDL diff --git a/centaur/src/main/resources/standardTestCases/docker_size_gcr.test b/centaur/src/main/resources/standardTestCases/docker_size_gcr.test index 2e8c2e1b2d..7399089f30 100644 --- a/centaur/src/main/resources/standardTestCases/docker_size_gcr.test +++ b/centaur/src/main/resources/standardTestCases/docker_size_gcr.test @@ -11,8 +11,8 @@ files { metadata { 
status: Succeeded - "outputs.docker_size_gcr.large_gcr_image_with_hash.bootDiskSize": 27 - "outputs.docker_size_gcr.large_gcr_image_with_tag.bootDiskSize": 27 + "outputs.docker_size_gcr.large_gcr_image_with_hash.bootDiskSize": 17 + "outputs.docker_size_gcr.large_gcr_image_with_tag.bootDiskSize": 17 } workflowType: WDL diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchRuntimeAttributes.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchRuntimeAttributes.scala index 5c35d53325..7e908dd92a 100644 --- a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchRuntimeAttributes.scala +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchRuntimeAttributes.scala @@ -60,7 +60,7 @@ object GcpBatchRuntimeAttributes { val BootDiskSizeKey = "bootDiskSizeGb" private val bootDiskValidationInstance = new IntRuntimeAttributesValidation(BootDiskSizeKey) - private val BootDiskDefaultValue = WomInteger(20) + private val BootDiskDefaultValue = WomInteger(10) val NoAddressKey = "noAddress" private val noAddressValidationInstance = new BooleanRuntimeAttributesValidation(NoAddressKey) diff --git a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiRuntimeAttributes.scala b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiRuntimeAttributes.scala index bc05b525f5..046840b602 100644 --- a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiRuntimeAttributes.scala +++ b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiRuntimeAttributes.scala @@ -66,7 +66,7 @@ object PipelinesApiRuntimeAttributes { val BootDiskSizeKey = "bootDiskSizeGb" private val bootDiskValidationInstance = new 
IntRuntimeAttributesValidation(BootDiskSizeKey) - private val BootDiskDefaultValue = WomInteger(20) + private val BootDiskDefaultValue = WomInteger(10) val NoAddressKey = "noAddress" private val noAddressValidationInstance = new BooleanRuntimeAttributesValidation(NoAddressKey) diff --git a/supportedBackends/google/pipelines/v2beta/src/main/scala/cromwell/backend/google/pipelines/v2beta/LifeSciencesFactory.scala b/supportedBackends/google/pipelines/v2beta/src/main/scala/cromwell/backend/google/pipelines/v2beta/LifeSciencesFactory.scala index 0d4996f63c..bcb373a902 100644 --- a/supportedBackends/google/pipelines/v2beta/src/main/scala/cromwell/backend/google/pipelines/v2beta/LifeSciencesFactory.scala +++ b/supportedBackends/google/pipelines/v2beta/src/main/scala/cromwell/backend/google/pipelines/v2beta/LifeSciencesFactory.scala @@ -154,13 +154,17 @@ case class LifeSciencesFactory(applicationName: String, authMode: GoogleAuthMode */ val adjustedBootDiskSize = { val fromRuntimeAttributes = createPipelineParameters.runtimeAttributes.bootDiskSize - // Compute the decompressed size based on the information available - val userCommandImageSizeInBytes = createPipelineParameters.jobDescriptor.dockerSize + + // Compute the decompressed size based on the information available. If we couldn't get the image size, + // default to 30GB. Defaulting to 0 can cause the task to run out of disk space. 
(more in AN-300) + val maybeUserCommandImageSizeInGB = createPipelineParameters.jobDescriptor.dockerSize .map(_.toFullSize(DockerConfiguration.instance.sizeCompressionFactor)) - .getOrElse(0L) - val userCommandImageSizeInGB = - MemorySize(userCommandImageSizeInBytes.toDouble, MemoryUnit.Bytes).to(MemoryUnit.GB).amount - val userCommandImageSizeRoundedUpInGB = userCommandImageSizeInGB.ceil.toInt + .map(s => MemorySize(s.toDouble, MemoryUnit.Bytes).to(MemoryUnit.GB)) + val (userCommandImageSizeInGB, userImageLogString) = maybeUserCommandImageSizeInGB match { + case Some(imageSize) => (imageSize, "user command image") + case None => (MemorySize(30, MemoryUnit.GB), "failed to obtain user command image size, using safe default") + } + val userCommandImageSizeRoundedUpInGB = userCommandImageSizeInGB.amount.ceil.toInt val totalSize = fromRuntimeAttributes + createPipelineParameters.dockerImageCacheDiskOpt @@ -171,7 +175,7 @@ case class LifeSciencesFactory(applicationName: String, authMode: GoogleAuthMode if (totalSize != fromRuntimeAttributes) { jobLogger.info( - s"Adjusting boot disk size to $totalSize GB: $fromRuntimeAttributes GB (runtime attributes) + $userCommandImageSizeRoundedUpInGB GB (user command image) + ${ActionUtils.cromwellImagesSizeRoundedUpInGB} GB (Cromwell support images)" + s"Adjusting boot disk size to $totalSize GB: $fromRuntimeAttributes GB (runtime attributes) + $userCommandImageSizeRoundedUpInGB GB ($userImageLogString) + ${ActionUtils.cromwellImagesSizeRoundedUpInGB} GB (Cromwell support images)" ) }