-
Notifications
You must be signed in to change notification settings - Fork 359
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AN-144] Cost capping GPU support #7672
base: develop
Are you sure you want to change the base?
Changes from all commits
5fd48a2
04c253b
a14e823
3d7ac06
473fdec
f0924b8
7233923
02437d2
e80cef3
ddc2f22
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,6 +7,7 @@ | |
import com.typesafe.config.Config | ||
import com.typesafe.scalalogging.LazyLogging | ||
import common.util.StringUtil.EnhancedToStringable | ||
import common.validation.ErrorOr | ||
import common.validation.ErrorOr._ | ||
import common.validation.ErrorOr.ErrorOr | ||
import cromwell.services.ServiceRegistryActor.ServiceRegistryMessage | ||
|
@@ -18,9 +19,9 @@ | |
import java.time.temporal.ChronoUnit.SECONDS | ||
import scala.util.Using | ||
|
||
case class CostCatalogKey(machineType: MachineType, | ||
case class CostCatalogKey(resourceInfo: ResourceInfo, | ||
usageType: UsageType, | ||
machineCustomization: MachineCustomization, | ||
machineCustomization: Option[MachineCustomization], | ||
resourceType: ResourceType, | ||
region: String | ||
) | ||
|
@@ -38,28 +39,47 @@ | |
final val expectedSku = | ||
(".*?N1 Predefined Instance (Core|Ram) .*|" + | ||
".*?N2 Custom Instance (Core|Ram) .*|" + | ||
".*?N2D AMD Custom Instance (Core|Ram) .*").r | ||
".*?N2D AMD Custom Instance (Core|Ram) .*|" + | ||
"Nvidia Tesla V100 GPU .*|" + | ||
"Nvidia Tesla P100 GPU .*|" + | ||
"Nvidia Tesla P4 GPU .*|" + | ||
"Nvidia Tesla T4 GPU .*").r | ||
// TODO: seems like it will probably still match GPU strings with extra stuff in front - | ||
// it just won't take any of those preceding characters | ||
// What is the point of the .*? ?? | ||
|
||
def apply(sku: Sku): List[CostCatalogKey] = | ||
for { | ||
_ <- expectedSku.findFirstIn(sku.getDescription).toList | ||
machineType <- MachineType.fromSku(sku).toList | ||
resourceInfo <- ResourceInfo.fromSku(sku).toList | ||
resourceType <- ResourceType.fromSku(sku).toList | ||
usageType <- UsageType.fromSku(sku).toList | ||
machineCustomization <- MachineCustomization.fromSku(sku).toList | ||
region <- sku.getServiceRegionsList.asScala.toList | ||
} yield CostCatalogKey(machineType, usageType, machineCustomization, resourceType, region) | ||
machineCustomization = if (resourceType == Gpu) None else Some(MachineCustomization.fromCpuOrRamSku(sku)) | ||
} yield CostCatalogKey(resourceInfo, usageType, machineCustomization, resourceType, region) | ||
|
||
def apply(instantiatedVmInfo: InstantiatedVmInfo, resourceType: ResourceType): ErrorOr[CostCatalogKey] = | ||
MachineType.fromGoogleMachineTypeString(instantiatedVmInfo.machineType).map { mType => | ||
CostCatalogKey( | ||
mType, | ||
if (resourceType == Gpu) | ||
for { | ||
gpuInfo <- ErrorOr(instantiatedVmInfo.gpuInfo.get) // TODO: improve error message (default: "None.get") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Currently, if this method is called with instantiated VM info that does not include GPUs, this line will fail with a very cryptic error message. It should fail — we can't get a GPU cost catalog key for a VM that doesn't use a GPU — but we should improve the error message. |
||
gpuType <- GpuType.fromGpuInfo(gpuInfo) | ||
} yield CostCatalogKey( | ||
gpuType, | ||
UsageType.fromBoolean(instantiatedVmInfo.preemptible), | ||
MachineCustomization.fromMachineTypeString(instantiatedVmInfo.machineType), | ||
resourceType, | ||
None, | ||
Gpu, | ||
instantiatedVmInfo.region | ||
) | ||
} | ||
else | ||
MachineType.fromGoogleMachineTypeString(instantiatedVmInfo.machineType).map { mType => | ||
CostCatalogKey( | ||
mType, | ||
UsageType.fromBoolean(instantiatedVmInfo.preemptible), | ||
Some(MachineCustomization.fromMachineTypeString(instantiatedVmInfo.machineType)), | ||
resourceType, | ||
instantiatedVmInfo.region | ||
) | ||
} | ||
} | ||
|
||
case class GcpCostLookupRequest(vmInfo: InstantiatedVmInfo, replyTo: ActorRef) extends ServiceRegistryMessage { | ||
|
@@ -116,6 +136,9 @@ | |
s"Expected usage units of RAM to be 'GiBy.h'. Got ${usageUnit}".invalidNel | ||
} | ||
} | ||
|
||
// TODO: implement this | ||
def calculateGpuPricePerHour(gpuSku: Sku, gpuCount: Long): ErrorOr[BigDecimal] = BigDecimal(1).validNel | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I did not have time to dig into how |
||
} | ||
|
||
/** | ||
|
@@ -200,8 +223,8 @@ | |
// As of Sept 2024 the cost catalog does not contain entries for custom N1 machines. If we're using N1, attempt | ||
// to fall back to predefined. | ||
lazy val n1PredefinedKey = | ||
(key.machineType, key.machineCustomization) match { | ||
case (N1, Custom) => Option(key.copy(machineCustomization = Predefined)) | ||
(key.resourceInfo, key.machineCustomization) match { | ||
case (N1, Some(Custom)) => Option(key.copy(machineCustomization = Some(Predefined))) | ||
case _ => None | ||
} | ||
val sku = getSku(key).orElse(n1PredefinedKey.flatMap(getSku)).map(_.catalogObject) | ||
|
@@ -212,23 +235,47 @@ | |
} | ||
|
||
// TODO consider caching this, answers won't change until we reload the SKUs | ||
def calculateVmCostPerHour(instantiatedVmInfo: InstantiatedVmInfo): ErrorOr[BigDecimal] = | ||
for { | ||
def calculateVmCostPerHour(instantiatedVmInfo: InstantiatedVmInfo): ErrorOr[BigDecimal] = { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This method had to be significantly restructured due to the fact that, unlike with CPUs and RAM, we need to be able to skip the GPU SKU lookup and cost calculation steps for a VM that does not use GPUs. Note that we can't just try to look up the SKU anyway and give the GPU cost as 0 if there is an error if we want to be able to distinguish between not having anything to look up and having a genuine error because, for example, the GPU type is not recognized. To the best of my knowledge, this necessitated breaking up the large |
||
val cpuPricingInfoErrorOr = for { | ||
cpuSku <- lookUpSku(instantiatedVmInfo, Cpu) | ||
coreCount <- MachineType.extractCoreCountFromMachineTypeString(instantiatedVmInfo.machineType) | ||
cpuPricePerHour <- GcpCostCatalogService.calculateCpuPricePerHour(cpuSku, coreCount) | ||
} yield (cpuSku, coreCount, cpuPricePerHour) | ||
|
||
val ramPricingInfoErrorOr = for { | ||
ramSku <- lookUpSku(instantiatedVmInfo, Ram) | ||
ramMbCount <- MachineType.extractRamMbFromMachineTypeString(instantiatedVmInfo.machineType) | ||
ramGbCount = ramMbCount / 1024d // need sub-integer resolution | ||
ramPricePerHour <- GcpCostCatalogService.calculateRamPricePerHour(ramSku, ramGbCount) | ||
totalCost = cpuPricePerHour + ramPricePerHour | ||
} yield (ramSku, ramGbCount, ramPricePerHour) | ||
|
||
val gpuPricingInfoErrorOr = instantiatedVmInfo.gpuInfo match { | ||
case None => (None, 0, BigDecimal(0)).validNel | ||
case Some(gpuInfo) => | ||
for { | ||
gpuSku <- lookUpSku(instantiatedVmInfo, Gpu) | ||
gpuCount = gpuInfo.count | ||
gpuPricePerHour <- GcpCostCatalogService.calculateGpuPricePerHour(gpuSku, gpuCount) | ||
} yield (Some(gpuSku), gpuCount, gpuPricePerHour) | ||
} | ||
|
||
for { | ||
cpuPricingInfo <- cpuPricingInfoErrorOr | ||
(cpuSku, coreCount, cpuPricePerHour) = cpuPricingInfo | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The downside of breaking up this method into smaller |
||
ramPricingInfo <- ramPricingInfoErrorOr | ||
(ramSku, ramGbCount, ramPricePerHour) = ramPricingInfo | ||
gpuPricingInfo <- gpuPricingInfoErrorOr | ||
(gpuSku, gpuCount, gpuPricePerHour) = gpuPricingInfo | ||
totalCost = cpuPricePerHour + ramPricePerHour + gpuPricePerHour | ||
_ = logger.info( | ||
s"Calculated vmCostPerHour of ${totalCost} " + | ||
s"(CPU ${cpuPricePerHour} for ${coreCount} cores [${cpuSku.getDescription}], " + | ||
s"RAM ${ramPricePerHour} for ${ramGbCount} Gb [${ramSku.getDescription}]) " + | ||
s"RAM ${ramPricePerHour} for ${ramGbCount} Gb [${ramSku.getDescription}], " + | ||
s"GPU ${gpuPricePerHour} for ${gpuCount} GPUs [${gpuSku.map(_.getDescription)}]) " + | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Unlike with the CPU and RAM SKUs, I sent back the GPU SKU from the GPU cost |
||
s"for ${instantiatedVmInfo}" | ||
) | ||
} yield totalCost | ||
} | ||
|
||
def serviceRegistryActor: ActorRef = serviceRegistry | ||
override def receive: Receive = { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,29 +6,39 @@ | |
|
||
import java.util.regex.{Matcher, Pattern} | ||
|
||
case class GpuInfo(count: Long, gpuType: String) | ||
|
||
/* | ||
* Case class that contains information retrieved from Google about a VM that cromwell has started | ||
*/ | ||
case class InstantiatedVmInfo(region: String, machineType: String, preemptible: Boolean) | ||
case class InstantiatedVmInfo(region: String, machineType: String, gpuInfo: Option[GpuInfo], preemptible: Boolean) | ||
|
||
/* | ||
* These types reflect hardcoded strings found in a google cost catalog. | ||
*/ | ||
|
||
sealed trait MachineType { def machineTypeName: String } | ||
case object N1 extends MachineType { override val machineTypeName = "n1" } | ||
case object N2 extends MachineType { override val machineTypeName = "n2" } | ||
case object N2d extends MachineType { override val machineTypeName = "n2d" } | ||
sealed trait ResourceInfo | ||
|
||
object MachineType { | ||
def fromSku(sku: Sku): Option[MachineType] = { | ||
object ResourceInfo { | ||
def fromSku(sku: Sku): Option[ResourceInfo] = { | ||
val tokenizedDescription = sku.getDescription.toLowerCase.split(" ") | ||
if (tokenizedDescription.contains(N1.machineTypeName)) Some(N1) | ||
else if (tokenizedDescription.contains(N2.machineTypeName)) Some(N2) | ||
else if (tokenizedDescription.contains(N2d.machineTypeName)) Some(N2d) | ||
else if (tokenizedDescription.contains(NvidiaTeslaV100.gpuTypeName)) Some(NvidiaTeslaV100) | ||
else if (tokenizedDescription.contains(NvidiaTeslaP100.gpuTypeName)) Some(NvidiaTeslaP100) | ||
else if (tokenizedDescription.contains(NvidiaTeslaP4.gpuTypeName)) Some(NvidiaTeslaP4) | ||
else if (tokenizedDescription.contains(NvidiaTeslaT4.gpuTypeName)) Some(NvidiaTeslaT4) | ||
else Option.empty | ||
} | ||
} | ||
|
||
sealed trait MachineType extends ResourceInfo { def machineTypeName: String } | ||
case object N1 extends MachineType { override val machineTypeName = "n1" } | ||
case object N2 extends MachineType { override val machineTypeName = "n2" } | ||
case object N2d extends MachineType { override val machineTypeName = "n2d" } | ||
|
||
object MachineType { | ||
// expects a string that looks something like "n1-standard-1" or "custom-1-4096" | ||
def fromGoogleMachineTypeString(machineTypeString: String): ErrorOr[MachineType] = { | ||
val mType = machineTypeString.toLowerCase | ||
|
@@ -61,6 +71,24 @@ | |
} | ||
} | ||
|
||
sealed trait GpuType extends ResourceInfo { def gpuTypeName: String } | ||
case object NvidiaTeslaV100 extends GpuType { override val gpuTypeName = "v100" } | ||
case object NvidiaTeslaP100 extends GpuType { override val gpuTypeName = "p100" } | ||
case object NvidiaTeslaP4 extends GpuType { override val gpuTypeName = "p4" } | ||
case object NvidiaTeslaT4 extends GpuType { override val gpuTypeName = "t4" } | ||
|
||
object GpuType { | ||
// expects GpuInfo with a GPU type that looks something like "nvidia-tesla-v100" | ||
def fromGpuInfo(gpuInfo: GpuInfo): ErrorOr[GpuType] = { | ||
val gpuType = gpuInfo.gpuType.toLowerCase | ||
if (gpuType.endsWith("-v100")) NvidiaTeslaV100.validNel | ||
else if (gpuType.endsWith("-p100")) NvidiaTeslaP100.validNel | ||
else if (gpuType.endsWith("-p4")) NvidiaTeslaP4.validNel | ||
else if (gpuType.endsWith("-t4")) NvidiaTeslaT4.validNel | ||
else s"Unrecognized GPU type: $gpuType".invalidNel | ||
} | ||
} | ||
|
||
sealed trait UsageType { def typeName: String } | ||
case object OnDemand extends UsageType { override val typeName = "ondemand" } | ||
case object Preemptible extends UsageType { override val typeName = "preemptible" } | ||
|
@@ -76,7 +104,6 @@ | |
case true => Preemptible | ||
case false => OnDemand | ||
} | ||
|
||
} | ||
|
||
sealed trait MachineCustomization { def customizationName: String } | ||
|
@@ -94,28 +121,30 @@ | |
- For non-N1 machines, both custom and predefined SKUs are included, custom ones include "Custom" in their description | ||
strings and predefined SKUs are only identifiable by the absence of "Custom." | ||
*/ | ||
def fromSku(sku: Sku): Option[MachineCustomization] = { | ||
def fromCpuOrRamSku(sku: Sku): MachineCustomization = { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It seemed unnecessary to have this method return an |
||
val tokenizedDescription = sku.getDescription.toLowerCase.split(" ") | ||
|
||
// ex. "N1 Predefined Instance Core running in Montreal" | ||
if (tokenizedDescription.contains(Predefined.customizationName)) Some(Predefined) | ||
if (tokenizedDescription.contains(Predefined.customizationName)) Predefined | ||
// ex. "N2 Custom Instance Core running in Paris" | ||
else if (tokenizedDescription.contains(Custom.customizationName)) Some(Custom) | ||
else if (tokenizedDescription.contains(Custom.customizationName)) Custom | ||
// ex. "N2 Instance Core running in Paris" | ||
else Some(Predefined) | ||
else Predefined | ||
} | ||
} | ||
|
||
sealed trait ResourceType { def groupName: String } | ||
case object Cpu extends ResourceType { override val groupName = "cpu" } | ||
case object Ram extends ResourceType { override val groupName = "ram" } | ||
case object Gpu extends ResourceType { override val groupName = "gpu" } | ||
|
||
object ResourceType { | ||
def fromSku(sku: Sku): Option[ResourceType] = { | ||
val tokenizedDescription = sku.getDescription.toLowerCase.split(" ") | ||
sku.getCategory.getResourceGroup.toLowerCase match { | ||
case Cpu.groupName => Some(Cpu) | ||
case Ram.groupName => Some(Ram) | ||
case Gpu.groupName => Some(Gpu) | ||
case "n1standard" if tokenizedDescription.contains("ram") => Some(Ram) | ||
case "n1standard" if tokenizedDescription.contains("core") => Some(Cpu) | ||
case _ => Option.empty | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Currently, since we use the
findFirstIn
function below, omitting the.*?
does not stop us from matching SKU descriptions that have text before "Nvidia Tesla." Really, as long as this does not affect CPU and RAM SKUs, we should change that function to require the full description to match the regex because those GPU SKUs with extra text at the start are for commitment pricing and are not needed.We should also verify my conclusion that all the GPU SKUs we need look like "Nvidia Tesla T4 GPU running in Paris" or "Nvidia Tesla P4 GPU attached to Spot Preemptible VMs running in Milan."