From e80cef35fc7c32d281cbd916c13bb689333d7481 Mon Sep 17 00:00:00 2001
From: Sam Schumacher <42777208+sam-schu@users.noreply.github.com>
Date: Thu, 19 Dec 2024 16:41:31 -0500
Subject: [PATCH] Fetch and look up GPU SKUs

Extend the GCP cost catalog lookup so that GPU SKUs are keyed and looked
up alongside CPU and RAM SKUs.

- CostCatalogKey now carries a ResourceInfo (a MachineType or a GpuType)
  instead of a MachineType, and machineCustomization becomes an Option
  that is None for GPU keys.
- The expected-SKU regex additionally accepts Nvidia Tesla V100, P100, P4
  and T4 GPU descriptions.
- calculateVmCostPerHour adds a GPU component to the CPU and RAM cost when
  the VM has GPU info. calculateGpuPricePerHour is still a placeholder
  that returns 1, so the GPU component is not yet a real price.
- MachineCustomization.fromSku is renamed to fromCpuOrRamSku and returns a
  MachineCustomization directly instead of an always-defined Option.
---
Review notes (changes relative to the patch as first posted):
- The GPU key is built by matching on gpuInfo instead of calling Option.get,
  so a VM without GPU info produces a descriptive error rather than
  "None.get". The `import common.validation.ErrorOr` line that only served
  that call is no longer added.
- The "no GPU" pricing tuple uses 0L and an explicit result type so the GPU
  count is not widened to AnyVal.
- The speculative TODO about the SKU regex is replaced by a note on how
  findFirstIn treats it.
- Line breaks and indentation were reconstructed; the post-image blob id on
  the first index line below is carried over and has not been recomputed.

 .../services/cost/GcpCostCatalogService.scala | 85 +++++++++++++++----
 .../services/cost/GcpCostCatalogTypes.scala   | 49 ++++++++---
 2 files changed, 105 insertions(+), 29 deletions(-)

diff --git a/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala b/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala
index 81b228b430..504ee067ce 100644
--- a/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala
+++ b/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala
@@ -18,9 +18,9 @@ import scala.jdk.CollectionConverters.IterableHasAsScala
 import java.time.temporal.ChronoUnit.SECONDS
 import scala.util.Using
 
-case class CostCatalogKey(machineType: MachineType,
+case class CostCatalogKey(resourceInfo: ResourceInfo,
                           usageType: UsageType,
-                          machineCustomization: MachineCustomization,
+                          machineCustomization: Option[MachineCustomization],
                           resourceType: ResourceType,
                           region: String
 )
@@ -38,28 +38,50 @@ object CostCatalogKey {
   final val expectedSku =
     (".*?N1 Predefined Instance (Core|Ram) .*|" +
       ".*?N2 Custom Instance (Core|Ram) .*|" +
-      ".*?N2D AMD Custom Instance (Core|Ram) .*").r
+      ".*?N2D AMD Custom Instance (Core|Ram) .*|" +
+      "Nvidia Tesla V100 GPU .*|" +
+      "Nvidia Tesla P100 GPU .*|" +
+      "Nvidia Tesla P4 GPU .*|" +
+      "Nvidia Tesla T4 GPU .*").r
+  // NB: `apply(sku)` tests descriptions with `findFirstIn`, which searches anywhere in the string, so the GPU
+  // alternatives also match descriptions that have extra text in front of them. For that unanchored search the
+  // leading `.*?` on the CPU/RAM alternatives is redundant rather than required.
 
   def apply(sku: Sku): List[CostCatalogKey] =
     for {
       _ <- expectedSku.findFirstIn(sku.getDescription).toList
-      machineType <- MachineType.fromSku(sku).toList
+      resourceInfo <- ResourceInfo.fromSku(sku).toList
       resourceType <- ResourceType.fromSku(sku).toList
       usageType <- UsageType.fromSku(sku).toList
-      machineCustomization <- MachineCustomization.fromSku(sku).toList
       region <- sku.getServiceRegionsList.asScala.toList
-    } yield CostCatalogKey(machineType, usageType, machineCustomization, resourceType, region)
+      machineCustomization = if (resourceType == Gpu) None else Some(MachineCustomization.fromCpuOrRamSku(sku))
+    } yield CostCatalogKey(resourceInfo, usageType, machineCustomization, resourceType, region)
 
   def apply(instantiatedVmInfo: InstantiatedVmInfo, resourceType: ResourceType): ErrorOr[CostCatalogKey] =
-    MachineType.fromGoogleMachineTypeString(instantiatedVmInfo.machineType).map { mType =>
-      CostCatalogKey(
-        mType,
+    if (resourceType == Gpu)
+      for {
+        gpuInfo <- instantiatedVmInfo.gpuInfo match {
+          case Some(info) => info.validNel[String]
+          case None => s"No GPU info found for GPU SKU lookup: ${instantiatedVmInfo}".invalidNel[GpuInfo]
+        }
+        gpuType <- GpuType.fromGpuInfo(gpuInfo)
+      } yield CostCatalogKey(
+        gpuType,
         UsageType.fromBoolean(instantiatedVmInfo.preemptible),
-        MachineCustomization.fromMachineTypeString(instantiatedVmInfo.machineType),
-        resourceType,
+        None,
+        Gpu,
         instantiatedVmInfo.region
       )
-    }
+    else
+      MachineType.fromGoogleMachineTypeString(instantiatedVmInfo.machineType).map { mType =>
+        CostCatalogKey(
+          mType,
+          UsageType.fromBoolean(instantiatedVmInfo.preemptible),
+          Some(MachineCustomization.fromMachineTypeString(instantiatedVmInfo.machineType)),
+          resourceType,
+          instantiatedVmInfo.region
+        )
+      }
 }
 
 case class GcpCostLookupRequest(vmInfo: InstantiatedVmInfo, replyTo: ActorRef) extends ServiceRegistryMessage {
@@ -116,6 +138,9 @@ object GcpCostCatalogService {
       s"Expected usage units of RAM to be 'GiBy.h'. Got ${usageUnit}".invalidNel
     }
   }
+
+  // TODO: implement this; for now it is a placeholder that reports a price of 1 for any SKU and count
+  def calculateGpuPricePerHour(gpuSku: Sku, gpuCount: Long): ErrorOr[BigDecimal] = BigDecimal(1).validNel
 }
 
 /**
@@ -200,8 +225,8 @@ class GcpCostCatalogService(serviceConfig: Config, globalConfig: Config, service
       // As of Sept 2024 the cost catalog does not contain entries for custom N1 machines. If we're using N1, attempt
       // to fall back to predefined.
       lazy val n1PredefinedKey =
-        (key.machineType, key.machineCustomization) match {
-          case (N1, Custom) => Option(key.copy(machineCustomization = Predefined))
+        (key.resourceInfo, key.machineCustomization) match {
+          case (N1, Some(Custom)) => Option(key.copy(machineCustomization = Some(Predefined)))
           case _ => None
         }
       val sku = getSku(key).orElse(n1PredefinedKey.flatMap(getSku)).map(_.catalogObject)
@@ -212,23 +237,47 @@ class GcpCostCatalogService(serviceConfig: Config, globalConfig: Config, service
     }
 
   // TODO consider caching this, answers won't change until we reload the SKUs
-  def calculateVmCostPerHour(instantiatedVmInfo: InstantiatedVmInfo): ErrorOr[BigDecimal] =
-    for {
+  def calculateVmCostPerHour(instantiatedVmInfo: InstantiatedVmInfo): ErrorOr[BigDecimal] = {
+    val cpuPricingInfoErrorOr = for {
       cpuSku <- lookUpSku(instantiatedVmInfo, Cpu)
       coreCount <- MachineType.extractCoreCountFromMachineTypeString(instantiatedVmInfo.machineType)
       cpuPricePerHour <- GcpCostCatalogService.calculateCpuPricePerHour(cpuSku, coreCount)
+    } yield (cpuSku, coreCount, cpuPricePerHour)
+
+    val ramPricingInfoErrorOr = for {
       ramSku <- lookUpSku(instantiatedVmInfo, Ram)
       ramMbCount <- MachineType.extractRamMbFromMachineTypeString(instantiatedVmInfo.machineType)
       ramGbCount = ramMbCount / 1024d // need sub-integer resolution
       ramPricePerHour <- GcpCostCatalogService.calculateRamPricePerHour(ramSku, ramGbCount)
-      totalCost = cpuPricePerHour + ramPricePerHour
+    } yield (ramSku, ramGbCount, ramPricePerHour)
+
+    val gpuPricingInfoErrorOr: ErrorOr[(Option[Sku], Long, BigDecimal)] = instantiatedVmInfo.gpuInfo match {
+      case None => (None, 0L, BigDecimal(0)).validNel
+      case Some(gpuInfo) =>
+        for {
+          gpuSku <- lookUpSku(instantiatedVmInfo, Gpu)
+          gpuCount = gpuInfo.count
+          gpuPricePerHour <- GcpCostCatalogService.calculateGpuPricePerHour(gpuSku, gpuCount)
+        } yield (Some(gpuSku), gpuCount, gpuPricePerHour)
+    }
+
+    for {
+      cpuPricingInfo <- cpuPricingInfoErrorOr
+      (cpuSku, coreCount, cpuPricePerHour) = cpuPricingInfo
+      ramPricingInfo <- ramPricingInfoErrorOr
+      (ramSku, ramGbCount, ramPricePerHour) = ramPricingInfo
+      gpuPricingInfo <- gpuPricingInfoErrorOr
+      (gpuSku, gpuCount, gpuPricePerHour) = gpuPricingInfo
+      totalCost = cpuPricePerHour + ramPricePerHour + gpuPricePerHour
       _ = logger.info(
         s"Calculated vmCostPerHour of ${totalCost} " +
           s"(CPU ${cpuPricePerHour} for ${coreCount} cores [${cpuSku.getDescription}], " +
-          s"RAM ${ramPricePerHour} for ${ramGbCount} Gb [${ramSku.getDescription}]) " +
+          s"RAM ${ramPricePerHour} for ${ramGbCount} Gb [${ramSku.getDescription}], " +
+          s"GPU ${gpuPricePerHour} for ${gpuCount} GPUs [${gpuSku.map(_.getDescription)}]) " +
           s"for ${instantiatedVmInfo}"
       )
     } yield totalCost
+  }
 
   def serviceRegistryActor: ActorRef = serviceRegistry
   override def receive: Receive = {
diff --git a/services/src/main/scala/cromwell/services/cost/GcpCostCatalogTypes.scala b/services/src/main/scala/cromwell/services/cost/GcpCostCatalogTypes.scala
index eea4c54247..9a03c258a1 100644
--- a/services/src/main/scala/cromwell/services/cost/GcpCostCatalogTypes.scala
+++ b/services/src/main/scala/cromwell/services/cost/GcpCostCatalogTypes.scala
@@ -17,20 +17,28 @@ case class InstantiatedVmInfo(region: String, machineType: String, gpuInfo: Opti
  * These types reflect hardcoded strings found in a google cost catalog.
  */
 
-sealed trait MachineType { def machineTypeName: String }
-case object N1 extends MachineType { override val machineTypeName = "n1" }
-case object N2 extends MachineType { override val machineTypeName = "n2" }
-case object N2d extends MachineType { override val machineTypeName = "n2d" }
+sealed trait ResourceInfo
 
-object MachineType {
-  def fromSku(sku: Sku): Option[MachineType] = {
+object ResourceInfo {
+  def fromSku(sku: Sku): Option[ResourceInfo] = {
     val tokenizedDescription = sku.getDescription.toLowerCase.split(" ")
     if (tokenizedDescription.contains(N1.machineTypeName)) Some(N1)
     else if (tokenizedDescription.contains(N2.machineTypeName)) Some(N2)
     else if (tokenizedDescription.contains(N2d.machineTypeName)) Some(N2d)
+    else if (tokenizedDescription.contains(NvidiaTeslaV100.gpuTypeName)) Some(NvidiaTeslaV100)
+    else if (tokenizedDescription.contains(NvidiaTeslaP100.gpuTypeName)) Some(NvidiaTeslaP100)
+    else if (tokenizedDescription.contains(NvidiaTeslaP4.gpuTypeName)) Some(NvidiaTeslaP4)
+    else if (tokenizedDescription.contains(NvidiaTeslaT4.gpuTypeName)) Some(NvidiaTeslaT4)
     else Option.empty
   }
+}
+
+sealed trait MachineType extends ResourceInfo { def machineTypeName: String }
+case object N1 extends MachineType { override val machineTypeName = "n1" }
+case object N2 extends MachineType { override val machineTypeName = "n2" }
+case object N2d extends MachineType { override val machineTypeName = "n2d" }
 
+object MachineType {
   // expects a string that looks something like "n1-standard-1" or "custom-1-4096"
   def fromGoogleMachineTypeString(machineTypeString: String): ErrorOr[MachineType] = {
     val mType = machineTypeString.toLowerCase
@@ -63,6 +71,24 @@ object MachineType {
   }
 }
 
+sealed trait GpuType extends ResourceInfo { def gpuTypeName: String }
+case object NvidiaTeslaV100 extends GpuType { override val gpuTypeName = "v100" }
+case object NvidiaTeslaP100 extends GpuType { override val gpuTypeName = "p100" }
+case object NvidiaTeslaP4 extends GpuType { override val gpuTypeName = "p4" }
+case object NvidiaTeslaT4 extends GpuType { override val gpuTypeName = "t4" }
+
+object GpuType {
+  // expects GpuInfo with a GPU type that looks something like "nvidia-tesla-v100"
+  def fromGpuInfo(gpuInfo: GpuInfo): ErrorOr[GpuType] = {
+    val gpuType = gpuInfo.gpuType.toLowerCase
+    if (gpuType.endsWith("-v100")) NvidiaTeslaV100.validNel
+    else if (gpuType.endsWith("-p100")) NvidiaTeslaP100.validNel
+    else if (gpuType.endsWith("-p4")) NvidiaTeslaP4.validNel
+    else if (gpuType.endsWith("-t4")) NvidiaTeslaT4.validNel
+    else s"Unrecognized GPU type: $gpuType".invalidNel
+  }
+}
+
 sealed trait UsageType { def typeName: String }
 case object OnDemand extends UsageType { override val typeName = "ondemand" }
 case object Preemptible extends UsageType { override val typeName = "preemptible" }
@@ -78,7 +104,6 @@ object UsageType {
     case true => Preemptible
     case false => OnDemand
   }
-
 }
 
 sealed trait MachineCustomization { def customizationName: String }
@@ -96,21 +121,22 @@ object MachineCustomization {
     - For non-N1 machines, both custom and predefined SKUs are included, custom ones include "Custom" in their description
       strings and predefined SKUs are only identifiable by the absence of "Custom."
    */
-  def fromSku(sku: Sku): Option[MachineCustomization] = {
+  def fromCpuOrRamSku(sku: Sku): MachineCustomization = {
     val tokenizedDescription = sku.getDescription.toLowerCase.split(" ")
 
     // ex. "N1 Predefined Instance Core running in Montreal"
-    if (tokenizedDescription.contains(Predefined.customizationName)) Some(Predefined)
+    if (tokenizedDescription.contains(Predefined.customizationName)) Predefined
     // ex. "N2 Custom Instance Core running in Paris"
-    else if (tokenizedDescription.contains(Custom.customizationName)) Some(Custom)
+    else if (tokenizedDescription.contains(Custom.customizationName)) Custom
     // ex. "N2 Instance Core running in Paris"
-    else Some(Predefined)
+    else Predefined
   }
 }
 
 sealed trait ResourceType { def groupName: String }
 case object Cpu extends ResourceType { override val groupName = "cpu" }
 case object Ram extends ResourceType { override val groupName = "ram" }
+case object Gpu extends ResourceType { override val groupName = "gpu" }
 
 object ResourceType {
   def fromSku(sku: Sku): Option[ResourceType] = {
@@ -118,6 +144,7 @@ object ResourceType {
     sku.getCategory.getResourceGroup.toLowerCase match {
       case Cpu.groupName => Some(Cpu)
       case Ram.groupName => Some(Ram)
+      case Gpu.groupName => Some(Gpu)
       case "n1standard" if tokenizedDescription.contains("ram") => Some(Ram)
       case "n1standard" if tokenizedDescription.contains("core") => Some(Cpu)
       case _ => Option.empty