diff --git a/data-connector-lib/src/dpk_connector/core/crawler.py b/data-connector-lib/src/dpk_connector/core/crawler.py index 4e706a944..4a6dccef8 100644 --- a/data-connector-lib/src/dpk_connector/core/crawler.py +++ b/data-connector-lib/src/dpk_connector/core/crawler.py @@ -13,12 +13,12 @@ import threading from typing import Any, Callable, Collection, Type, cast +from dpk_connector.core.utils import validate_domain, validate_url from scrapy import Spider from scrapy.crawler import Crawler, CrawlerRunner from scrapy.settings import Settings from twisted.internet.defer import Deferred -from dpk_connector.core.utils import validate_domain, validate_url _lock = threading.Lock() _reactor_initialized = False @@ -61,9 +61,7 @@ def _create_crawler(self, spidercls: str | type[Spider]) -> Crawler: with _lock: global _reactor_initialized init_reactor = not _reactor_initialized - crawler = Crawler( - cast(Type[Spider], spidercls), self.settings, init_reactor - ) + crawler = Crawler(cast(Type[Spider], spidercls), self.settings, init_reactor) _reactor_initialized = True return crawler @@ -140,9 +138,7 @@ def async_crawl( if concurrent_requests < 1: raise ValueError(f"Invalid concurrent requests {concurrent_requests}") if concurrent_requests_per_domain < 1: - raise ValueError( - f"Invalid concurrent reuqests per domain {concurrent_requests_per_domain}" - ) + raise ValueError(f"Invalid concurrent requests per domain {concurrent_requests_per_domain}") if download_delay < 0: raise ValueError(f"Invalid download delay {download_delay}") if download_timeout < 0: @@ -150,9 +146,7 @@ def async_crawl( if autothrottle_max_delay < 0: raise ValueError(f"Invalid autothrottle max delay {autothrottle_max_delay}") if autothrottle_target_concurrency < 1: - raise ValueError( - f"Invalid autothrottle target concurrency {autothrottle_target_concurrency}" - ) + raise ValueError(f"Invalid autothrottle target concurrency {autothrottle_target_concurrency}") if robots_max_crawl_delay < 0: raise ValueError(f"Invalid robots max crawl delay {robots_max_crawl_delay}") @@ -178,9 +172,7 @@ def async_crawl( priority="spider", ) settings.set("DOWNLOAD_DELAY", download_delay, priority="spider") - settings.set( - "RANDOMIZE_DOWNLOAD_DELAY", randomize_download_delay, priority="spider" - ) + settings.set("RANDOMIZE_DOWNLOAD_DELAY", randomize_download_delay, priority="spider") settings.set("DOWNLOAD_TIMEOUT", download_timeout, priority="spider") settings.set("AUTOTHROTTLE_ENABLED", autothrottle_enabled, priority="spider") settings.set("AUTOTHROTTLE_MAX_DELAY", autothrottle_max_delay, priority="spider") diff --git a/data-connector-lib/src/dpk_connector/core/middlewares.py b/data-connector-lib/src/dpk_connector/core/middlewares.py index 7d0738b79..a9ce867bd 100644 --- a/data-connector-lib/src/dpk_connector/core/middlewares.py +++ b/data-connector-lib/src/dpk_connector/core/middlewares.py @@ -13,6 +13,13 @@ import logging from typing import Any, Generator, Iterable +from dpk_connector.core.item import ConnectorItem +from dpk_connector.core.utils import ( + get_content_type, + get_etld1, + get_mime_type, + get_netloc, +) from scrapy import Spider, signals from scrapy.crawler import Crawler from scrapy.downloadermiddlewares.robotstxt import RobotsTxtMiddleware @@ -26,8 +33,6 @@ from scrapy.utils.python import to_unicode from twisted.internet.defer import Deferred -from dpk_connector.core.item import ConnectorItem -from dpk_connector.core.utils import get_content_type, get_etld1, get_mime_type, get_netloc logger = logging.getLogger(__name__) @@
-48,9 +53,7 @@ def delay(self, user_agent: str | bytes) -> float | None: if crawl_delay is None and request_rate is None: return None crawl_delay = crawl_delay or 0 - request_rate = ( - request_rate.seconds / request_rate.requests if request_rate else 0 - ) + request_rate = request_rate.seconds / request_rate.requests if request_rate else 0 delay = min(max(crawl_delay, request_rate), self.max_delay) return delay @@ -64,9 +67,7 @@ def __init__(self, crawler: Crawler, download_timeout: float): super().__init__(crawler) self.download_timeout = download_timeout self._delays: dict[str, float] = {} - crawler.signals.connect( - self._request_reached_downloader, signal=signals.request_reached_downloader - ) + crawler.signals.connect(self._request_reached_downloader, signal=signals.request_reached_downloader) @classmethod def from_crawler(cls, crawler: Crawler): @@ -86,9 +87,7 @@ def _request_reached_downloader(self, request: Request, spider: Spider) -> None: slot.delay = delay slot.randomize_delay = False - def process_request_2( - self, rp: RobotParser, request: Request, spider: Spider - ) -> None: + def process_request_2(self, rp: RobotParser, request: Request, spider: Spider) -> None: super().process_request_2(rp, request, spider) if isinstance(rp, DelayingProtegoRobotParser): parts = urlparse_cached(request) @@ -96,15 +95,11 @@ def process_request_2( if domain not in self._delays: user_agent = self._robotstxt_useragent if not user_agent: - user_agent = request.headers.get( - b"User-Agent", self._default_useragent - ) + user_agent = request.headers.get(b"User-Agent", self._default_useragent) delay = rp.delay(user_agent) or 0.0 self._delays[domain] = delay if delay: - logger.info( - f"Set download delay to {delay} according to robots.txt. domain: {domain}" - ) + logger.info(f"Set download delay to {delay} according to robots.txt. 
domain: {domain}") def robot_parser(self, request: Request, spider: Spider): url = urlparse_cached(request) @@ -207,9 +202,7 @@ def process_request(self, request: Request, spider: Spider): super().process_request(request, spider) prefix = "dpk_connector/requested" if not request.meta.get("system_request", False): - _update_request_stats( - self.stats, request, spider, prefix, self.skip_domains - ) + _update_request_stats(self.stats, request, spider, prefix, self.skip_domains) if request.meta.get("sitemap", False): _update_sitemap_stats(self.stats, spider, prefix) @@ -217,9 +210,7 @@ def process_response(self, request: Request, response: Response, spider: Spider) ret = super().process_response(request, response, spider) prefix = "dpk_connector/accessed" if not request.meta.get("system_request", False): - _update_stats( - self.stats, request, response, spider, prefix, self.skip_domains - ) + _update_stats(self.stats, request, response, spider, prefix, self.skip_domains) if request.meta.get("sitemap", False): _update_sitemap_stats(self.stats, spider, prefix) return ret diff --git a/data-connector-lib/src/dpk_connector/core/pipelines.py b/data-connector-lib/src/dpk_connector/core/pipelines.py index b9c35d896..4fe4d5c97 100644 --- a/data-connector-lib/src/dpk_connector/core/pipelines.py +++ b/data-connector-lib/src/dpk_connector/core/pipelines.py @@ -11,12 +11,12 @@ ################################################################################ from typing import Any + +from dpk_connector.core.item import ConnectorItem from scrapy import Spider from scrapy.crawler import Crawler from scrapy.exceptions import DropItem -from dpk_connector.core.item import ConnectorItem - class DropPipeline: @classmethod diff --git a/data-connector-lib/src/dpk_connector/core/spiders/sitemap.py b/data-connector-lib/src/dpk_connector/core/spiders/sitemap.py index de18ab596..78b7dd830 100644 --- a/data-connector-lib/src/dpk_connector/core/spiders/sitemap.py +++ b/data-connector-lib/src/dpk_connector/core/spiders/sitemap.py @@ -14,14 +14,6 @@ from typing import Any, Callable, Collection, Generator from urllib.parse import ParseResult -from scrapy import Request -from scrapy.http import Response -from scrapy.link import Link -from scrapy.linkextractors import LinkExtractor -from scrapy.spiders import SitemapSpider -from scrapy.spiders.sitemap import iterloc -from scrapy.utils.sitemap import Sitemap, sitemap_urls_from_robots - from dpk_connector.core.item import ConnectorItem from dpk_connector.core.utils import ( get_base_url, @@ -32,6 +24,13 @@ is_allowed_path, urlparse_cached, ) +from scrapy import Request +from scrapy.http import Response +from scrapy.link import Link +from scrapy.linkextractors import LinkExtractor +from scrapy.spiders import SitemapSpider +from scrapy.spiders.sitemap import iterloc +from scrapy.utils.sitemap import Sitemap, sitemap_urls_from_robots class BaseSitemapSpider(SitemapSpider): @@ -99,13 +98,9 @@ def __init__( self.allowed_domains.add(fqdn) else: self.allowed_domains = set(get_etld1(url) for url in seed_urls) - self.allow_mime_types = set( - [m.lower() for m in allow_mime_types] if len(allow_mime_types) > 0 else () - ) + self.allow_mime_types = set([m.lower() for m in allow_mime_types] if len(allow_mime_types) > 0 else ()) self.disallow_mime_types = set( - [m.lower() for m in disallow_mime_types] - if len(disallow_mime_types) > 0 - else () + [m.lower() for m in disallow_mime_types] if len(disallow_mime_types) > 0 else () ) # Link extraction from html @@ -161,15 +156,11 @@ def 
start_requests(self): ) def _parse_sitemap(self, response: Response): - yield ConnectorItem( - dropped=False, downloaded=False, system_request=True, sitemap=True - ) + yield ConnectorItem(dropped=False, downloaded=False, system_request=True, sitemap=True) seed_url = response.meta["seed_url"] - if response.url.endswith("/robots.txt") or response.url.endswith( - "/robots.txt/" - ): + if response.url.endswith("/robots.txt") or response.url.endswith("/robots.txt/"): for url in sitemap_urls_from_robots(response.text, base_url=response.url): yield Request( url, @@ -197,9 +188,7 @@ def _parse_sitemap(self, response: Response): if s.type == "sitemapindex": for loc in iterloc(it, self.sitemap_alternate_links): - if any( - x.search(loc) for x in self._follow - ) and self._is_allowed_path(loc): + if any(x.search(loc) for x in self._follow) and self._is_allowed_path(loc): yield Request( loc, callback=self._parse_sitemap, @@ -244,9 +233,7 @@ def _should_download(self, content_type: str | None) -> bool: return not self._is_disallowed_content_type(ctype) if not self.disallow_mime_types: return self._is_allowed_content_type(ctype) - return ( - not self._is_disallowed_content_type(ctype) - ) and self._is_allowed_content_type(ctype) + return (not self._is_disallowed_content_type(ctype)) and self._is_allowed_content_type(ctype) def _explore_sitemap(self, response: Response) -> Generator[Request, Any, None]: depth = response.meta.get("depth", 0) @@ -255,9 +242,7 @@ def _explore_sitemap(self, response: Response) -> Generator[Request, Any, None]: parts = urlparse_cached(response) domain = parts.netloc if domain not in self.sitemaps_seen: - self.log( - f"New domain {domain} found. Search for sitemap.", logging.INFO - ) + self.log(f"New domain {domain} found. Search for sitemap.", logging.INFO) self.sitemaps_seen.add(domain) for sitemap in self._get_sitemap_urls(parts): yield Request( @@ -272,9 +257,7 @@ def _explore_sitemap(self, response: Response) -> Generator[Request, Any, None]: }, ) - def _explore_links( - self, response: Response, links: list[Link] - ) -> Generator[Request, Any, None]: + def _explore_links(self, response: Response, links: list[Link]) -> Generator[Request, Any, None]: depth = response.meta.get("depth", 0) depth_limit = self.depth_limit if depth_limit == 0 or depth < depth_limit: @@ -303,9 +286,7 @@ def __init__( self.callback = callback - def parse( - self, response: Response, **kwargs: Any - ) -> Generator[Request | ConnectorItem, Any, None]: + def parse(self, response: Response, **kwargs: Any) -> Generator[Request | ConnectorItem, Any, None]: drop = False content_type = get_content_type(response) if not content_type: @@ -315,24 +296,16 @@ def parse( if not (is_html or should_download): drop = True if drop: - yield ConnectorItem( - dropped=True, downloaded=False, system_request=False, sitemap=False - ) + yield ConnectorItem(dropped=True, downloaded=False, system_request=False, sitemap=False) return # Download contents if should_download: - self.callback( - str(response.url), response.body, response.headers.to_unicode_dict() - ) + self.callback(str(response.url), response.body, response.headers.to_unicode_dict()) # to count up downloaded pages and collect stats - yield ConnectorItem( - dropped=False, downloaded=True, system_request=False, sitemap=False - ) + yield ConnectorItem(dropped=False, downloaded=True, system_request=False, sitemap=False) else: - yield ConnectorItem( - dropped=False, downloaded=False, system_request=False, sitemap=False - ) + yield ConnectorItem(dropped=False, 
downloaded=False, system_request=False, sitemap=False) # Search for sitemap yield from self._explore_sitemap(response) diff --git a/data-processing-lib/python/src/data_processing/data_access/arrow_s3.py b/data-processing-lib/python/src/data_processing/data_access/arrow_s3.py index edf5dbc1b..6fa2da432 100644 --- a/data-processing-lib/python/src/data_processing/data_access/arrow_s3.py +++ b/data-processing-lib/python/src/data_processing/data_access/arrow_s3.py @@ -95,6 +95,7 @@ def list_folders(self, key: str) -> tuple[list[str], int]: :param key: complete folder :return: list of folders within a given folder and number of retries """ + def _get_sub_folders(bck: str, p: str) -> tuple[list[str], int]: sub_folders = [] # use paginator @@ -113,6 +114,7 @@ def _get_sub_folders(bck: str, p: str) -> tuple[list[str], int]: internal_retries += r sub_folders.extend(sf) return sub_folders, internal_retries + bucket, prefix = self._get_bucket_key(key) subs, retries = _get_sub_folders(bck=bucket, p=prefix) return [f"{bucket}/{f}" for f in subs], retries diff --git a/data-processing-lib/python/src/data_processing/data_access/data_access.py b/data-processing-lib/python/src/data_processing/data_access/data_access.py index 51d7b54b8..4a60d0a75 100644 --- a/data-processing-lib/python/src/data_processing/data_access/data_access.py +++ b/data-processing-lib/python/src/data_processing/data_access/data_access.py @@ -14,21 +14,22 @@ from typing import Any import pyarrow as pa -from data_processing.utils import KB, MB, GB, TransformUtils, get_logger +from data_processing.utils import GB, KB, MB, TransformUtils, get_logger class DataAccess: """ Base class for data access (interface), defining all the methods """ + def __init__( - self, - d_sets: list[str], - checkpoint: bool, - m_files: int, - n_samples: int, - files_to_use: list[str], - files_to_checkpoint: list[str], + self, + d_sets: list[str], + checkpoint: bool, + m_files: int, + n_samples: int, + files_to_use: list[str], + files_to_checkpoint: list[str], ): """ Create data access class for folder based configuration @@ -118,7 +119,11 @@ def _get_files_to_process_internal(self) -> tuple[list[str], dict[str, float], i if self.d_sets is not None: # get folders for the input folders_to_use, retries = self._get_folders_to_use() - profile = {"max_file_size": 0.0, "min_file_size": 0.0, "total_file_size": 0.0} + profile = { + "max_file_size": 0.0, + "min_file_size": 0.0, + "total_file_size": 0.0, + } if len(folders_to_use) > 0: # if we have valid folders path_list = [] @@ -163,12 +168,12 @@ def _get_folders_to_use(self) -> tuple[list[str], int]: raise NotImplementedError("Subclasses should implement this!") def _get_files_folder( - self, - path: str, - files_to_use: list[str], - cm_files: int, - max_file_size: int = 0, - min_file_size: int = MB * GB + self, + path: str, + files_to_use: list[str], + cm_files: int, + max_file_size: int = 0, + min_file_size: int = MB * GB, ) -> tuple[list[dict[str, Any]], dict[str, float], int]: """ Support method to get list input files and their profile @@ -212,12 +217,12 @@ def _get_files_folder( ) def _get_input_files( - self, - input_path: str, - output_path: str, - cm_files: int, - max_file_size: int = 0, - min_file_size: int = MB * GB, + self, + input_path: str, + output_path: str, + cm_files: int, + max_file_size: int = 0, + min_file_size: int = MB * GB, ) -> tuple[list[str], dict[str, float], int]: """ Get list and size of files from input path, that do not exist in the output path @@ -240,17 +245,16 @@ def _get_input_files( 
pout_list, _, retries1 = self._get_files_folder( path=output_path, files_to_use=self.files_to_checkpoint, cm_files=-1 ) - output_base_names_ext = [file["name"].replace(self.get_output_folder(), self.get_input_folder()) - for file in pout_list] + output_base_names_ext = [ + file["name"].replace(self.get_output_folder(), self.get_input_folder()) for file in pout_list + ] # In the case of binary transforms, an extension can be different, so just use the file names. # Also remove duplicates output_base_names = list(set([TransformUtils.get_file_extension(file)[0] for file in output_base_names_ext])) p_list = [] total_input_file_size = 0 i = 0 - files, _, retries = self._get_files_folder( - path=input_path, files_to_use=self.files_to_use, cm_files=-1 - ) + files, _, retries = self._get_files_folder(path=input_path, files_to_use=self.files_to_use, cm_files=-1) retries += retries1 for file in files: if i >= cm_files > 0: @@ -318,6 +322,7 @@ def get_folder_files( directory is returned (False) :return: A dictionary of file names/binary content will be returned """ + def _get_file_content(name: str, dt: bool) -> tuple[bytes, int]: """ return file content @@ -330,9 +335,7 @@ def _get_file_content(name: str, dt: bool) -> tuple[bytes, int]: return None, 0 result = {} - files, _, retries = self._get_files_folder( - path=path, files_to_use=extensions, cm_files=-1 - ) + files, _, retries = self._get_files_folder(path=path, files_to_use=extensions, cm_files=-1) for file in files: f_name = str(file["name"]) b, retries1 = _get_file_content(f_name, return_data) diff --git a/data-processing-lib/python/src/data_processing/data_access/data_access_factory.py b/data-processing-lib/python/src/data_processing/data_access/data_access_factory.py index 2172e3ed0..e34828c74 100644 --- a/data-processing-lib/python/src/data_processing/data_access/data_access_factory.py +++ b/data-processing-lib/python/src/data_processing/data_access/data_access_factory.py @@ -62,7 +62,10 @@ def add_input_params(self, parser: argparse.ArgumentParser) -> None: help_example_dict = { "access_key": ["access", "access key help text"], "secret_key": ["secret", "secret key help text"], - "url": ["https://s3.us-east.cloud-object-storage.appdomain.cloud", "optional s3 url"], + "url": [ + "https://s3.us-east.cloud-object-storage.appdomain.cloud", + "optional s3 url", + ], "region": ["us-east-1", "optional s3 region"], } parser.add_argument( @@ -94,8 +97,14 @@ def __add_data_navigation_params(self, parser): help="AST string containing input/output paths.\n" + ParamsUtils.get_ast_help_text(help_example_dict), ) help_example_dict = { - "input_folder": ["./input", "Path to input folder of files to be processed"], - "output_folder": ["/tmp/output", "Path to output folder of processed files"], + "input_folder": [ + "./input", + "Path to input folder of files to be processed", + ], + "output_folder": [ + "/tmp/output", + "Path to output folder of processed files", + ], } parser.add_argument( f"--{self.cli_arg_prefix}local_config", @@ -105,7 +114,10 @@ def __add_data_navigation_params(self, parser): + ParamsUtils.get_ast_help_text(help_example_dict), ) parser.add_argument( - f"--{self.cli_arg_prefix}max_files", type=int, default=-1, help="Max amount of files to process" + f"--{self.cli_arg_prefix}max_files", + type=int, + default=-1, + help="Max amount of files to process", ) parser.add_argument( f"--{self.cli_arg_prefix}checkpointing", @@ -135,7 +147,10 @@ def __add_data_navigation_params(self, parser): help="list of file extensions to choose for input.", ) 
parser.add_argument( - f"--{self.cli_arg_prefix}num_samples", type=int, default=-1, help="number of random input files to process" + f"--{self.cli_arg_prefix}num_samples", + type=int, + default=-1, + help="number of random input files to process", ) def apply_input_params(self, args: Union[dict, argparse.Namespace]) -> bool: diff --git a/data-processing-lib/python/src/data_processing/data_access/data_access_local.py b/data-processing-lib/python/src/data_processing/data_access/data_access_local.py index d37e571a3..f757d655b 100644 --- a/data-processing-lib/python/src/data_processing/data_access/data_access_local.py +++ b/data-processing-lib/python/src/data_processing/data_access/data_access_local.py @@ -50,8 +50,14 @@ def __init__( :param files_to_use: files extensions of files to include :param files_to_checkpoint: files extensions of files to use for checkpointing """ - super().__init__(d_sets=d_sets, checkpoint=checkpoint, m_files=m_files, n_samples=n_samples, - files_to_use=files_to_use, files_to_checkpoint=files_to_checkpoint) + super().__init__( + d_sets=d_sets, + checkpoint=checkpoint, + m_files=m_files, + n_samples=n_samples, + files_to_use=files_to_use, + files_to_checkpoint=files_to_checkpoint, + ) if local_config is None: self.input_folder = None self.output_folder = None diff --git a/data-processing-lib/python/src/data_processing/data_access/data_access_s3.py b/data-processing-lib/python/src/data_processing/data_access/data_access_s3.py index 8ddc772c5..231470e00 100644 --- a/data-processing-lib/python/src/data_processing/data_access/data_access_s3.py +++ b/data-processing-lib/python/src/data_processing/data_access/data_access_s3.py @@ -46,8 +46,14 @@ def __init__( :param files_to_use: files extensions of files to include :param files_to_checkpoint: files extensions of files to use for checkpointing """ - super().__init__(d_sets=d_sets, checkpoint=checkpoint, m_files=m_files, n_samples=n_samples, - files_to_use=files_to_use, files_to_checkpoint=files_to_checkpoint) + super().__init__( + d_sets=d_sets, + checkpoint=checkpoint, + m_files=m_files, + n_samples=n_samples, + files_to_use=files_to_use, + files_to_checkpoint=files_to_checkpoint, + ) if ( s3_credentials is None or s3_credentials.get("access_key", None) is None @@ -164,7 +170,10 @@ def save_job_metadata(self, metadata: dict[str, Any]) -> tuple[dict[str, Any], i return None, 0 metadata["source"] = {"name": self.input_folder, "type": "path"} metadata["target"] = {"name": self.output_folder, "type": "path"} - return self.save_file(path=f"{self.output_folder}metadata.json", data=json.dumps(metadata, indent=2).encode()) + return self.save_file( + path=f"{self.output_folder}metadata.json", + data=json.dumps(metadata, indent=2).encode(), + ) def get_file(self, path: str) -> tuple[bytes, int]: """ @@ -193,4 +202,4 @@ def save_file(self, path: str, data: bytes) -> tuple[dict[str, Any], int]: try: return self.arrS3.save_file(key=path, data=data) except Exception as e: - self.logger.error(f"Exception saving file {path} - {e}") \ No newline at end of file + self.logger.error(f"Exception saving file {path} - {e}") diff --git a/data-processing-lib/python/src/data_processing/runtime/__init__.py b/data-processing-lib/python/src/data_processing/runtime/__init__.py index 7fb42a33a..540ba4a76 100644 --- a/data-processing-lib/python/src/data_processing/runtime/__init__.py +++ b/data-processing-lib/python/src/data_processing/runtime/__init__.py @@ -1,4 +1,12 @@ -from data_processing.runtime.execution_configuration import 
TransformExecutionConfiguration, runtime_cli_prefix +from data_processing.runtime.execution_configuration import ( + TransformExecutionConfiguration, + runtime_cli_prefix, +) from data_processing.runtime.runtime_configuration import TransformRuntimeConfiguration -from data_processing.runtime.transform_launcher import AbstractTransformLauncher, multi_launcher -from data_processing.runtime.transform_file_processor import AbstractTransformFileProcessor +from data_processing.runtime.transform_launcher import ( + AbstractTransformLauncher, + multi_launcher, +) +from data_processing.runtime.transform_file_processor import ( + AbstractTransformFileProcessor, +) diff --git a/data-processing-lib/python/src/data_processing/runtime/execution_configuration.py b/data-processing-lib/python/src/data_processing/runtime/execution_configuration.py index 0762843ce..fa484f2b5 100644 --- a/data-processing-lib/python/src/data_processing/runtime/execution_configuration.py +++ b/data-processing-lib/python/src/data_processing/runtime/execution_configuration.py @@ -45,7 +45,12 @@ def add_input_params(self, parser: argparse.ArgumentParser) -> None: :param parser: parser :return: """ - parser.add_argument(f"--{runtime_cli_prefix}pipeline_id", type=str, default="pipeline_id", help="pipeline id") + parser.add_argument( + f"--{runtime_cli_prefix}pipeline_id", + type=str, + default="pipeline_id", + help="pipeline id", + ) parser.add_argument(f"--{runtime_cli_prefix}job_id", type=str, default="job_id", help="job id") help_example_dict = { diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/__init__.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/__init__.py index eedaad84d..8e7ef1f3a 100644 --- a/data-processing-lib/python/src/data_processing/runtime/pure_python/__init__.py +++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/__init__.py @@ -1,10 +1,21 @@ -from data_processing.runtime.pure_python.transform_runtime import DefaultPythonTransformRuntime -from data_processing.runtime.pure_python.runtime_configuration import PythonTransformRuntimeConfiguration -from data_processing.runtime.pure_python.execution_configuration import PythonTransformExecutionConfiguration +from data_processing.runtime.pure_python.transform_runtime import ( + DefaultPythonTransformRuntime, +) +from data_processing.runtime.pure_python.runtime_configuration import ( + PythonTransformRuntimeConfiguration, +) +from data_processing.runtime.pure_python.execution_configuration import ( + PythonTransformExecutionConfiguration, +) from data_processing.runtime.pure_python.transform_file_processor import ( PythonTransformFileProcessor, PythonPoolTransformFileProcessor, ) from data_processing.runtime.pure_python.transform_orchestrator import orchestrate -from data_processing.runtime.pure_python.transform_launcher import PythonTransformLauncher -from data_processing.runtime.pure_python.transform_invoker import invoke_transform, execute_python_transform +from data_processing.runtime.pure_python.transform_launcher import ( + PythonTransformLauncher, +) +from data_processing.runtime.pure_python.transform_invoker import ( + invoke_transform, + execute_python_transform, +) diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/execution_configuration.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/execution_configuration.py index ebb3a40c4..ce6f0b3fc 100644 --- 
a/data-processing-lib/python/src/data_processing/runtime/pure_python/execution_configuration.py +++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/execution_configuration.py @@ -41,7 +41,12 @@ def add_input_params(self, parser: argparse.ArgumentParser) -> None: :param parser: parser :return: """ - parser.add_argument(f"--{cli_prefix}num_processors", type=int, default=0, help="size of multiprocessing pool") + parser.add_argument( + f"--{cli_prefix}num_processors", + type=int, + default=0, + help="size of multiprocessing pool", + ) return TransformExecutionConfiguration.add_input_params(self, parser=parser) diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py index 44ccd0ef0..eb624bf08 100644 --- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py +++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py @@ -69,7 +69,7 @@ def __init__( data_access_factory: DataAccessFactoryBase, transform_params: dict[str, Any], transform_class: type[AbstractTransform], - is_folder: bool + is_folder: bool, ): """ Init method diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py index 812be8caf..b383a1ed6 100644 --- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py +++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py @@ -12,11 +12,11 @@ import os import time import traceback -import psutil from datetime import datetime from multiprocessing import Pool from typing import Any +import psutil from data_processing.data_access import DataAccessFactoryBase from data_processing.runtime.pure_python import ( PythonPoolTransformFileProcessor, @@ -24,7 +24,11 @@ PythonTransformFileProcessor, PythonTransformRuntimeConfiguration, ) -from data_processing.transform import AbstractTransform, TransformStatistics, AbstractFolderTransform +from data_processing.transform import ( + AbstractFolderTransform, + AbstractTransform, + TransformStatistics, +) from data_processing.utils import GB, get_logger @@ -41,7 +45,7 @@ def _execution_resources() -> dict[str, Any]: # Getting memory used mused = round(psutil.virtual_memory()[3] / GB, 2) return { - "cpus": round((load15/os.cpu_count()) * 100, 1), + "cpus": round((load15 / os.cpu_count()) * 100, 1), "gpus": 0, "memory": mused, "object_store": 0, @@ -100,7 +104,9 @@ def orchestrate( data_access_factory=data_access_factory, print_interval=print_interval, transform_params=runtime.get_transform_config( - data_access_factory=data_access_factory, statistics=statistics, files=files + data_access_factory=data_access_factory, + statistics=statistics, + files=files, ), transform_class=runtime_config.get_transform_class(), is_folder=is_folder, @@ -113,7 +119,9 @@ def orchestrate( print_interval=print_interval, statistics=statistics, transform_params=runtime.get_transform_config( - data_access_factory=data_access_factory, statistics=statistics, files=files + data_access_factory=data_access_factory, + statistics=statistics, + files=files, ), transform_class=runtime_config.get_transform_class(), is_folder=is_folder, @@ -145,8 +153,8 @@ def orchestrate( "job_input_params": input_params | 
data_access_factory.get_input_params() | execution_config.get_input_params(), - "execution_stats": _execution_resources() | - {"execution time, min": round((time.time() - start_time) / 60.0, 3)}, + "execution_stats": _execution_resources() + | {"execution time, min": round((time.time() - start_time) / 60.0, 3)}, "job_output_stats": stats, } logger.debug(f"Saving job metadata: {metadata}.") @@ -211,7 +219,7 @@ def _process_transforms_multiprocessor( data_access_factory: DataAccessFactoryBase, transform_params: dict[str, Any], transform_class: type[AbstractTransform], - is_folder: bool + is_folder: bool, ) -> TransformStatistics: """ Process transforms using multiprocessing pool diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_runtime.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_runtime.py index 478d40837..dd3d9ddb6 100644 --- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_runtime.py +++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_runtime.py @@ -12,7 +12,7 @@ from typing import Any -from data_processing.data_access import DataAccessFactoryBase, DataAccess +from data_processing.data_access import DataAccess, DataAccessFactoryBase from data_processing.transform import TransformStatistics @@ -37,7 +37,10 @@ def get_folders(self, data_access: DataAccess) -> list[str]: raise NotImplemented() def get_transform_config( - self, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics, files: list[str] + self, + data_access_factory: DataAccessFactoryBase, + statistics: TransformStatistics, + files: list[str], ) -> dict[str, Any]: """ Get the dictionary of configuration that will be provided to the transform's initializer. diff --git a/data-processing-lib/python/src/data_processing/runtime/transform_launcher.py b/data-processing-lib/python/src/data_processing/runtime/transform_launcher.py index 4c3abbd83..07b8eea35 100644 --- a/data-processing-lib/python/src/data_processing/runtime/transform_launcher.py +++ b/data-processing-lib/python/src/data_processing/runtime/transform_launcher.py @@ -10,9 +10,9 @@ # limitations under the License. 
################################################################################ +import argparse import sys from typing import Any -import argparse from data_processing.data_access import DataAccessFactory, DataAccessFactoryBase from data_processing.runtime import TransformRuntimeConfiguration @@ -68,9 +68,9 @@ def _get_parameters(self, args: argparse.Namespace) -> bool: :return: True if validation passes or False, if not """ return ( - self.runtime_config.apply_input_params(args=args) - and self.execution_config.apply_input_params(args=args) - and self.data_access_factory.apply_input_params(args=args) + self.runtime_config.apply_input_params(args=args) + and self.execution_config.apply_input_params(args=args) + and self.data_access_factory.apply_input_params(args=args) ) def _submit_for_execution(self) -> int: diff --git a/data-processing-lib/python/src/data_processing/test_support/abstract_test.py b/data-processing-lib/python/src/data_processing/test_support/abstract_test.py index 76fd29000..96cf6a0c6 100644 --- a/data-processing-lib/python/src/data_processing/test_support/abstract_test.py +++ b/data-processing-lib/python/src/data_processing/test_support/abstract_test.py @@ -103,7 +103,13 @@ def validate_expected_tables(cls, table_list: list[pa.Table], expected_table_lis cls.validate_expected_row(i, j, r1, r2) @classmethod - def validate_expected_row(cls, table_index: int, row_index: int, test_row: pa.Table, expected_row: pa.Table): + def validate_expected_row( + cls, + table_index: int, + row_index: int, + test_row: pa.Table, + expected_row: pa.Table, + ): """ Compare the two rows for equality, allowing float values to be within a percentage of each other as defined by global _allowed_float_percent_diff. @@ -140,7 +146,11 @@ def validate_expected_row(cls, table_index: int, row_index: int, test_row: pa.Ta assert diff <= allowed_diff, msg @classmethod - def validate_expected_files(cls, files_list: list[tuple[bytes, str]], expected_files_list: list[tuple[bytes, str]]): + def validate_expected_files( + cls, + files_list: list[tuple[bytes, str]], + expected_files_list: list[tuple[bytes, str]], + ): """ Verify with assertion messages that the two lists of Tables are equivalent. :param files_list: @@ -172,7 +182,9 @@ def validate_expected_files(cls, files_list: list[tuple[bytes, str]], expected_f ), f"produced file length {lenf1} vs expected {lenf2}, exceeds allowance of {diff_allowed}" @classmethod - def validate_expected_metadata_lists(cls, metadata: list[dict[str, float]], expected_metadata: list[dict[str, float]]): + def validate_expected_metadata_lists( + cls, metadata: list[dict[str, float]], expected_metadata: list[dict[str, float]] + ): elen = len(expected_metadata) assert len(metadata) == elen, f"Number of metadata dictionaries not the expected of {elen}" for index in range(elen): @@ -238,7 +250,12 @@ def _validate_table_files(cls, parquet1: str, parquet2: str, drop_columns: list[ @classmethod def __confirm_diffs( - cls, src_dir: str, expected_dir: str, diff_files: list, dest_dir: str, drop_columns: list[str] = [] + cls, + src_dir: str, + expected_dir: str, + diff_files: list, + dest_dir: str, + drop_columns: list[str] = [], ): """ Copy all files from the source dir to the dest dir. 
diff --git a/data-processing-lib/python/src/data_processing/test_support/data_access/data_access_factory_test.py b/data-processing-lib/python/src/data_processing/test_support/data_access/data_access_factory_test.py index 16066d8d7..cceefe3fb 100644 --- a/data-processing-lib/python/src/data_processing/test_support/data_access/data_access_factory_test.py +++ b/data-processing-lib/python/src/data_processing/test_support/data_access/data_access_factory_test.py @@ -28,7 +28,11 @@ def _verify_files(rootdir, expected_files, found_files): class AbstractDataAccessFactoryTests: def test_get_all(self): params = {} - expected_files = ["ds1/sample1.parquet", "ds1/sample2.parquet", "ds2/sample3.parquet"] + expected_files = [ + "ds1/sample1.parquet", + "ds1/sample2.parquet", + "ds2/sample3.parquet", + ] self._run_test(params, expected_files) def test_nsamples(self): diff --git a/data-processing-lib/python/src/data_processing/test_support/launch/transform_test.py b/data-processing-lib/python/src/data_processing/test_support/launch/transform_test.py index 77d21fc0d..8ad8c969f 100644 --- a/data-processing-lib/python/src/data_processing/test_support/launch/transform_test.py +++ b/data-processing-lib/python/src/data_processing/test_support/launch/transform_test.py @@ -85,7 +85,10 @@ def _install_test_fixtures(self, metafunc): fixtures[fi] = f + ([],) fi += 1 # Install the fixture, matching the parameter names used by test_transform() method. - metafunc.parametrize("launcher,cli_params,in_table_path,expected_out_table_path,ignore_columns", fixtures) + metafunc.parametrize( + "launcher,cli_params,in_table_path,expected_out_table_path,ignore_columns", + fixtures, + ) def get_test_transform_fixtures(self) -> list[tuple]: """ diff --git a/data-processing-lib/python/src/data_processing/test_support/transform/__init__.py b/data-processing-lib/python/src/data_processing/test_support/transform/__init__.py index 04d6f3b0f..14d0f7c2c 100644 --- a/data-processing-lib/python/src/data_processing/test_support/transform/__init__.py +++ b/data-processing-lib/python/src/data_processing/test_support/transform/__init__.py @@ -1,11 +1,15 @@ -from data_processing.test_support.transform.table_transform_test import AbstractTableTransformTest -from data_processing.test_support.transform.binary_transform_test import AbstractBinaryTransformTest +from data_processing.test_support.transform.table_transform_test import ( + AbstractTableTransformTest, +) +from data_processing.test_support.transform.binary_transform_test import ( + AbstractBinaryTransformTest, +) from data_processing.test_support.transform.noop_transform import ( NOOPTransform, NOOPTransformConfiguration, - NOOPPythonTransformConfiguration + NOOPPythonTransformConfiguration, ) from data_processing.test_support.transform.noop_folder_transform import ( NOOPFolderTransform, - NOOPFolderPythonTransformConfiguration -) \ No newline at end of file + NOOPFolderPythonTransformConfiguration, +) diff --git a/data-processing-lib/python/src/data_processing/test_support/transform/binary_transform_test.py b/data-processing-lib/python/src/data_processing/test_support/transform/binary_transform_test.py index f7cbb950e..c4b61a7fe 100644 --- a/data-processing-lib/python/src/data_processing/test_support/transform/binary_transform_test.py +++ b/data-processing-lib/python/src/data_processing/test_support/transform/binary_transform_test.py @@ -35,7 +35,10 @@ def _install_test_fixtures(self, metafunc): # Let the sub-class define the specific tests and test data for the transform under test. 
f = self.get_test_transform_fixtures() # Install the fixture, matching the parameter names used by test_transform() method. - metafunc.parametrize("transform,in_binary_list,expected_binary_list,expected_metadata_list", f) + metafunc.parametrize( + "transform,in_binary_list,expected_binary_list,expected_metadata_list", + f, + ) def test_transform( self, diff --git a/data-processing-lib/python/src/data_processing/test_support/transform/noop_folder_transform.py b/data-processing-lib/python/src/data_processing/test_support/transform/noop_folder_transform.py index 5baab7858..d7eae0095 100644 --- a/data-processing-lib/python/src/data_processing/test_support/transform/noop_folder_transform.py +++ b/data-processing-lib/python/src/data_processing/test_support/transform/noop_folder_transform.py @@ -15,12 +15,13 @@ from data_processing.data_access import DataAccess from data_processing.runtime.pure_python import ( + DefaultPythonTransformRuntime, PythonTransformLauncher, PythonTransformRuntimeConfiguration, - DefaultPythonTransformRuntime) +) +from data_processing.test_support.transform import NOOPTransformConfiguration from data_processing.transform import AbstractFolderTransform from data_processing.utils import get_logger -from data_processing.test_support.transform import NOOPTransformConfiguration logger = get_logger(__name__) @@ -94,8 +95,10 @@ def __init__(self): """ Initialization """ - super().__init__(transform_config=NOOPTransformConfiguration(clazz=NOOPFolderTransform), - runtime_class=NOOPFolderPythonRuntime) + super().__init__( + transform_config=NOOPTransformConfiguration(clazz=NOOPFolderTransform), + runtime_class=NOOPFolderPythonRuntime, + ) if __name__ == "__main__": diff --git a/data-processing-lib/python/src/data_processing/test_support/transform/noop_transform.py b/data-processing-lib/python/src/data_processing/test_support/transform/noop_transform.py index 2fea35506..71e5a9a99 100644 --- a/data-processing-lib/python/src/data_processing/test_support/transform/noop_transform.py +++ b/data-processing-lib/python/src/data_processing/test_support/transform/noop_transform.py @@ -19,7 +19,11 @@ from data_processing.runtime.pure_python.runtime_configuration import ( PythonTransformRuntimeConfiguration, ) -from data_processing.transform import AbstractTableTransform, TransformConfiguration, AbstractTransform +from data_processing.transform import ( + AbstractTableTransform, + AbstractTransform, + TransformConfiguration, +) from data_processing.utils import CLIArgumentProvider, get_logger @@ -69,7 +73,6 @@ def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Tab class NOOPTransformConfiguration(TransformConfiguration): - """ Provides support for configuring and using the associated Transform class include configuration with CLI args. 
diff --git a/data-processing-lib/python/src/data_processing/transform/__init__.py b/data-processing-lib/python/src/data_processing/transform/__init__.py index 20254e47b..1c772d9a7 100644 --- a/data-processing-lib/python/src/data_processing/transform/__init__.py +++ b/data-processing-lib/python/src/data_processing/transform/__init__.py @@ -3,4 +3,7 @@ from data_processing.transform.binary_transform import AbstractBinaryTransform from data_processing.transform.table_transform import AbstractTableTransform from data_processing.transform.transform_statistics import TransformStatistics -from data_processing.transform.transform_configuration import TransformConfiguration, get_transform_config +from data_processing.transform.transform_configuration import ( + TransformConfiguration, + get_transform_config, +) diff --git a/data-processing-lib/python/src/data_processing/transform/abstract_transform.py b/data-processing-lib/python/src/data_processing/transform/abstract_transform.py index 89db70f42..beeb39a85 100644 --- a/data-processing-lib/python/src/data_processing/transform/abstract_transform.py +++ b/data-processing-lib/python/src/data_processing/transform/abstract_transform.py @@ -10,7 +10,8 @@ # limitations under the License. ################################################################################ + class AbstractTransform: """ Base class for all transform types - """ \ No newline at end of file + """ diff --git a/data-processing-lib/python/src/data_processing/transform/binary_transform.py b/data-processing-lib/python/src/data_processing/transform/binary_transform.py index b313aff2f..780aaacf9 100644 --- a/data-processing-lib/python/src/data_processing/transform/binary_transform.py +++ b/data-processing-lib/python/src/data_processing/transform/binary_transform.py @@ -11,6 +11,7 @@ ################################################################################ from typing import Any + from data_processing.transform import AbstractTransform diff --git a/data-processing-lib/python/src/data_processing/transform/folder_transform.py b/data-processing-lib/python/src/data_processing/transform/folder_transform.py index caa3bfa52..77bf27bd3 100644 --- a/data-processing-lib/python/src/data_processing/transform/folder_transform.py +++ b/data-processing-lib/python/src/data_processing/transform/folder_transform.py @@ -11,6 +11,7 @@ ################################################################################ from typing import Any + from data_processing.transform import AbstractTransform @@ -37,4 +38,4 @@ def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str to metadata. Each element of the return list, is a tuple of the transformed bytes and a string holding the file name to use. 
""" - raise NotImplemented() \ No newline at end of file + raise NotImplemented() diff --git a/data-processing-lib/python/src/data_processing/transform/transform_configuration.py b/data-processing-lib/python/src/data_processing/transform/transform_configuration.py index a5c9ec9ad..ebb1a08f2 100644 --- a/data-processing-lib/python/src/data_processing/transform/transform_configuration.py +++ b/data-processing-lib/python/src/data_processing/transform/transform_configuration.py @@ -23,7 +23,10 @@ class TransformConfiguration(CLIArgumentProvider): """ def __init__( - self, name: str, transform_class: type[AbstractTransform], remove_from_metadata: list[str] = [] + self, + name: str, + transform_class: type[AbstractTransform], + remove_from_metadata: list[str] = [], ): """ Initialization @@ -72,7 +75,9 @@ def get_transform_params(self) -> dict[str, Any]: def get_transform_config( - transform_configuration: TransformConfiguration, argv: list[str], parser: ArgumentParser = None + transform_configuration: TransformConfiguration, + argv: list[str], + parser: ArgumentParser = None, ): """ Create a transform configuration dictionary using the given Configuration class and dictionary of diff --git a/data-processing-lib/python/src/data_processing/utils/__init__.py b/data-processing-lib/python/src/data_processing/utils/__init__.py index aa1d01b68..9c38f8103 100644 --- a/data-processing-lib/python/src/data_processing/utils/__init__.py +++ b/data-processing-lib/python/src/data_processing/utils/__init__.py @@ -3,6 +3,13 @@ from data_processing.utils.cli_utils import GB, KB, MB, CLIArgumentProvider, str2bool from data_processing.utils.log import get_logger from data_processing.utils.params_utils import ParamsUtils -from data_processing.utils.transform_utils import TransformUtils, RANDOM_SEED, LOCAL_TO_DISK +from data_processing.utils.transform_utils import ( + TransformUtils, + RANDOM_SEED, + LOCAL_TO_DISK, +) from data_processing.utils.pipinstaller import PipInstaller -from data_processing.utils.transform_configurator import TransformRuntime, TransformsConfiguration +from data_processing.utils.transform_configurator import ( + TransformRuntime, + TransformsConfiguration, +) diff --git a/data-processing-lib/python/src/data_processing/utils/multilock.py b/data-processing-lib/python/src/data_processing/utils/multilock.py index 3507ddc7b..abb48618f 100644 --- a/data-processing-lib/python/src/data_processing/utils/multilock.py +++ b/data-processing-lib/python/src/data_processing/utils/multilock.py @@ -122,7 +122,7 @@ def release(self): """ if self.fd is not None: self.thread_lock.acquire() - if self.fd is not None: # Retest now that we have the thread lock. + if self.fd is not None: # Retest now that we have the thread lock. 
os.close(self.fd) self.fd = None self.thread_lock.release() diff --git a/data-processing-lib/python/src/data_processing/utils/params_utils.py b/data-processing-lib/python/src/data_processing/utils/params_utils.py index d7ce2b940..767282ee6 100644 --- a/data-processing-lib/python/src/data_processing/utils/params_utils.py +++ b/data-processing-lib/python/src/data_processing/utils/params_utils.py @@ -58,7 +58,12 @@ def dict_to_req(d: dict[str, Any], executor: str = "") -> list[str]: return res @staticmethod - def __dict_to_str(dict_val: dict[str, str], initial_indent: str, indent_per_level: str, as_value: bool) -> str: + def __dict_to_str( + dict_val: dict[str, str], + initial_indent: str, + indent_per_level: str, + as_value: bool, + ) -> str: all_text = "" if as_value: all_text = all_text + "{ " diff --git a/data-processing-lib/python/test/data_processing_tests/data_access/daf_local_test.py b/data-processing-lib/python/test/data_processing_tests/data_access/daf_local_test.py index 37b00fa4b..e3650f3c4 100644 --- a/data-processing-lib/python/test/data_processing_tests/data_access/daf_local_test.py +++ b/data-processing-lib/python/test/data_processing_tests/data_access/daf_local_test.py @@ -28,10 +28,22 @@ def _get_io_params(self): # -> str, dict: """ params = {} input_folder = os.path.abspath( - os.path.join(os.path.dirname(__file__), "../../../test-data", "data_processing", "daf", "input") + os.path.join( + os.path.dirname(__file__), + "../../../test-data", + "data_processing", + "daf", + "input", + ) ) output_folder = os.path.abspath( - os.path.join(os.path.dirname(__file__), "../../../test-data", "data_processing", "daf", "output") + os.path.join( + os.path.dirname(__file__), + "../../../test-data", + "data_processing", + "daf", + "output", + ) ) local_conf = { "input_folder": input_folder, diff --git a/data-processing-lib/python/test/data_processing_tests/data_access/data_access_local_test.py b/data-processing-lib/python/test/data_processing_tests/data_access/data_access_local_test.py index 7e3e60294..307df7f7c 100644 --- a/data-processing-lib/python/test/data_processing_tests/data_access/data_access_local_test.py +++ b/data-processing-lib/python/test/data_processing_tests/data_access/data_access_local_test.py @@ -31,10 +31,26 @@ class TestInit: "output_folder": os.path.join(os.sep, "tmp", "output_guf"), } dal = DataAccessLocal(path_dict, d_sets=["dset1", "dset2"], checkpoint=True, m_files=-1) - size_stat_dict_empty = {"max_file_size": 0.0, "min_file_size": float(GB), "total_file_size": 0.0} - size_stat_dict = {"max_file_size": 0.0, "min_file_size": 0.0, "total_file_size": 0.0} - size_stat_dict_1 = {"max_file_size": 1.0, "min_file_size": 0.0, "total_file_size": 1.0} - size_stat_dict_1_1 = {"max_file_size": 1.0, "min_file_size": 1.0, "total_file_size": 1.0} + size_stat_dict_empty = { + "max_file_size": 0.0, + "min_file_size": float(GB), + "total_file_size": 0.0, + } + size_stat_dict = { + "max_file_size": 0.0, + "min_file_size": 0.0, + "total_file_size": 0.0, + } + size_stat_dict_1 = { + "max_file_size": 1.0, + "min_file_size": 0.0, + "total_file_size": 1.0, + } + size_stat_dict_1_1 = { + "max_file_size": 1.0, + "min_file_size": 1.0, + "total_file_size": 1.0, + } class TestGetFilesFolder(TestInit): @@ -59,7 +75,11 @@ def test_single_file(self): result = self.dal._get_files_folder(path=str(directory), files_to_use=None, cm_files=0) os.remove(file_path) os.rmdir(directory) - assert result == ([{'name': '/tmp/input_guf/empty_dir/file.parquet', 'size': 0}], self.size_stat_dict, 0) + assert 
result == ( + [{"name": "/tmp/input_guf/empty_dir/file.parquet", "size": 0}], + self.size_stat_dict, + 0, + ) def test_multiple_files(self): """ @@ -75,7 +95,10 @@ def test_multiple_files(self): file_list, size_dict, retries = self.dal._get_files_folder(path=str(directory), files_to_use=None, cm_files=0) results = ([f["name"] for f in file_list], size_dict) - expected_results = ([str(file.absolute()) for file in files], self.size_stat_dict_1) + expected_results = ( + [str(file.absolute()) for file in files], + self.size_stat_dict_1, + ) for file in files: os.remove(file) os.rmdir(directory) @@ -287,10 +310,26 @@ def multiple_missing_files_setup(self): fp.write(" " * MB) file_list, size_dict, _ = self.dal.get_files_to_process() result = (file_list, size_dict) - return result, in_files_1, in_files_2, out_file_2, in_path_1, out_path_1, in_path_2, out_path_2 + return ( + result, + in_files_1, + in_files_2, + out_file_2, + in_path_1, + out_path_1, + in_path_2, + out_path_2, + ) def multiple_missing_files_cleanup( - self, in_files_1, in_files_2, out_file_2, in_path_1, out_path_1, in_path_2, out_path_2 + self, + in_files_1, + in_files_2, + out_file_2, + in_path_1, + out_path_1, + in_path_2, + out_path_2, ): files_to_remove = in_files_1 + in_files_2 + [out_file_2] directories_to_remove = [in_path_1, out_path_1, in_path_2, out_path_2] @@ -317,7 +356,13 @@ def test_multiple_missing_files(self): self.size_stat_dict_1, ) self.multiple_missing_files_cleanup( - in_files_1, in_files_2, out_file_2, in_path_1, out_path_1, in_path_2, out_path_2 + in_files_1, + in_files_2, + out_file_2, + in_path_1, + out_path_1, + in_path_2, + out_path_2, ) assert result == expected_result @@ -338,7 +383,13 @@ def test_multiple_missing_files(self): self.size_stat_dict_1, ) self.multiple_missing_files_cleanup( - in_files_1, in_files_2, out_file_2, in_path_1, out_path_1, in_path_2, out_path_2 + in_files_1, + in_files_2, + out_file_2, + in_path_1, + out_path_1, + in_path_2, + out_path_2, ) assert result == expected_result @@ -356,7 +407,13 @@ def test_multiple_missing_files(self): expected_result = ([str(in_files_1[0].absolute())], self.size_stat_dict_1_1) self.multiple_missing_files_cleanup( - in_files_1, in_files_2, out_file_2, in_path_1, out_path_1, in_path_2, out_path_2 + in_files_1, + in_files_2, + out_file_2, + in_path_1, + out_path_1, + in_path_2, + out_path_2, ) assert result == expected_result @@ -403,7 +460,12 @@ class TestReadPyarrowTable(TestInit): data = {"col1": [1, 2, 3], "col2": ["a", "b", "c"]} # Create PyArrow schema - schema = pyarrow.schema([pyarrow.field("col1", pyarrow.int32()), pyarrow.field("col2", pyarrow.string())]) + schema = pyarrow.schema( + [ + pyarrow.field("col1", pyarrow.int32()), + pyarrow.field("col2", pyarrow.string()), + ] + ) # Create PyArrow table table = pyarrow.Table.from_pydict(mapping=data, schema=schema) @@ -513,7 +575,8 @@ def test_save_job_metadata(self): with open(metadata_file_path, "r") as fp: metadata_dict = json.load(fp) self.cleanup( - directories_to_remove=[self.dal.input_folder, self.dal.output_folder], files_to_remove=[metadata_file_path] + directories_to_remove=[self.dal.input_folder, self.dal.output_folder], + files_to_remove=[metadata_file_path], ) assert metadata_file_path == os.path.join(self.dal.output_folder, "metadata.json") assert ( @@ -623,7 +686,10 @@ class TestSaveFile(TestInit): def test_successful_save(self): file_info, _ = self.dal.save_file(self.new_file_path, b"This is new data") - assert file_info == {"name": self.new_file_path, "size": 
os.path.getsize(self.new_file_path)} + assert file_info == { + "name": self.new_file_path, + "size": os.path.getsize(self.new_file_path), + } os.remove(self.new_file_path) def test_invalid_filename(self): diff --git a/data-processing-lib/python/test/data_processing_tests/data_access/data_access_s3_test.py b/data-processing-lib/python/test/data_processing_tests/data_access/data_access_s3_test.py index 9cff1f6b0..c4027daf7 100644 --- a/data-processing-lib/python/test/data_processing_tests/data_access/data_access_s3_test.py +++ b/data-processing-lib/python/test/data_processing_tests/data_access/data_access_s3_test.py @@ -33,7 +33,10 @@ def _create_and_populate_bucket(d_a: DataAccessS3, input_location: str, n_files: d_a.arrS3.s3_client.create_bucket(Bucket="test") # upload file loc = os.path.abspath( - os.path.join(os.path.dirname(__file__), "../../../test-data/data_processing/input/sample1.parquet") + os.path.join( + os.path.dirname(__file__), + "../../../test-data/data_processing/input/sample1.parquet", + ) ) with open(loc, "rb") as file: bdata = file.read() @@ -48,7 +51,13 @@ def test_table_read_write(): """ with mock_aws(): # create data access - d_a = DataAccessS3(s3_credentials=s3_cred, s3_config=s3_conf, d_sets=None, checkpoint=False, m_files=-1) + d_a = DataAccessS3( + s3_credentials=s3_cred, + s3_config=s3_conf, + d_sets=None, + checkpoint=False, + m_files=-1, + ) # populate bucket input_location = "test/table_read_write/input/" _create_and_populate_bucket(d_a=d_a, input_location=input_location, n_files=1) @@ -98,7 +107,13 @@ def test_files_to_process(): """ with mock_aws(): # create data access - d_a = DataAccessS3(s3_credentials=s3_cred, s3_config=s3_conf, d_sets=None, checkpoint=False, m_files=-1) + d_a = DataAccessS3( + s3_credentials=s3_cred, + s3_config=s3_conf, + d_sets=None, + checkpoint=False, + m_files=-1, + ) # populate bucket _create_and_populate_bucket(d_a=d_a, input_location=f"{s3_conf['input_folder']}dataset=d1/", n_files=4) _create_and_populate_bucket(d_a=d_a, input_location=f"{s3_conf['input_folder']}dataset=d2/", n_files=4) diff --git a/data-processing-lib/python/test/data_processing_tests/data_access/sample_input_data_test.py b/data-processing-lib/python/test/data_processing_tests/data_access/sample_input_data_test.py index b3e74ee20..2a4185a96 100644 --- a/data-processing-lib/python/test/data_processing_tests/data_access/sample_input_data_test.py +++ b/data-processing-lib/python/test/data_processing_tests/data_access/sample_input_data_test.py @@ -22,7 +22,10 @@ def test_table_sampling_data(): """ input_folder = os.path.abspath( - os.path.join(os.path.dirname(__file__), "../../../test-data/data_processing/input_multiple") + os.path.join( + os.path.dirname(__file__), + "../../../test-data/data_processing/input_multiple", + ) ) output_folder = "/tmp" print(input_folder) diff --git a/data-processing-lib/python/test/data_processing_tests/invoker/python_invoker_test.py b/data-processing-lib/python/test/data_processing_tests/invoker/python_invoker_test.py index cd64f14b7..acbfd7d34 100644 --- a/data-processing-lib/python/test/data_processing_tests/invoker/python_invoker_test.py +++ b/data-processing-lib/python/test/data_processing_tests/invoker/python_invoker_test.py @@ -32,10 +32,16 @@ def test_configuration(): def test_execution(): input_dir = os.path.abspath( - os.path.join(os.path.dirname(__file__), "../../../../../transforms/universal/noop/python/test-data/input") + os.path.join( + os.path.dirname(__file__), + 
"../../../../../transforms/universal/noop/python/test-data/input", + ) ) output_dir = os.path.abspath( - os.path.join(os.path.dirname(__file__), "../../../../../transforms/universal/noop/python/output") + os.path.join( + os.path.dirname(__file__), + "../../../../../transforms/universal/noop/python/output", + ) ) t_configuration = TransformsConfiguration() res = execute_python_transform( diff --git a/data-processing-lib/python/test/data_processing_tests/launch/pure_python/test_noop_python_multiprocessor.py b/data-processing-lib/python/test/data_processing_tests/launch/pure_python/test_noop_python_multiprocessor.py index 5f6a1da60..7392ad6d4 100644 --- a/data-processing-lib/python/test/data_processing_tests/launch/pure_python/test_noop_python_multiprocessor.py +++ b/data-processing-lib/python/test/data_processing_tests/launch/pure_python/test_noop_python_multiprocessor.py @@ -30,8 +30,12 @@ def get_test_transform_fixtures(self) -> list[tuple]: basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), basedir)) fixtures = [] launcher = PythonTransformLauncher(NOOPPythonTransformConfiguration()) - fixtures.append(( - launcher, - {"noop_sleep_sec": 0, "runtime_num_processors": 2}, - basedir + "/input", basedir + "/expected")) + fixtures.append( + ( + launcher, + {"noop_sleep_sec": 0, "runtime_num_processors": 2}, + basedir + "/input", + basedir + "/expected", + ) + ) return fixtures diff --git a/data-processing-lib/python/test/data_processing_tests/transform/test_folders_noop.py b/data-processing-lib/python/test/data_processing_tests/transform/test_folders_noop.py index e0fdd86c8..bee675092 100644 --- a/data-processing-lib/python/test/data_processing_tests/transform/test_folders_noop.py +++ b/data-processing-lib/python/test/data_processing_tests/transform/test_folders_noop.py @@ -12,11 +12,13 @@ import os +from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing.test_support.launch.transform_test import ( AbstractTransformLauncherTest, ) -from data_processing.runtime.pure_python import PythonTransformLauncher -from data_processing.test_support.transform import NOOPFolderPythonTransformConfiguration +from data_processing.test_support.transform import ( + NOOPFolderPythonTransformConfiguration, +) class TestRayNOOPTransform(AbstractTransformLauncherTest): diff --git a/data-processing-lib/python/test/data_processing_tests/transform/test_noop.py b/data-processing-lib/python/test/data_processing_tests/transform/test_noop.py index 1eb85fe48..1ff5e7514 100644 --- a/data-processing-lib/python/test/data_processing_tests/transform/test_noop.py +++ b/data-processing-lib/python/test/data_processing_tests/transform/test_noop.py @@ -21,7 +21,10 @@ table = pa.Table.from_pydict({"name": pa.array(["Tom", "Dick", "Harry"]), "age": pa.array([0, 1, 2])}) expected_table = table # We're a noop after all. 
-expected_metadata_list = [{"nfiles": 1, "nrows": 3}, {}] # transform() result # flush() result +expected_metadata_list = [ + {"nfiles": 1, "nrows": 3}, + {}, +] # transform() result # flush() result class TestNOOPTransform(AbstractTableTransformTest): @@ -32,7 +35,17 @@ class TestNOOPTransform(AbstractTableTransformTest): def get_test_transform_fixtures(self) -> list[Tuple]: fixtures = [ - (NOOPTransform({"sleep": 0}), [table], [expected_table], expected_metadata_list), - (NOOPTransform({"sleep": 0}), [table], [expected_table], expected_metadata_list), + ( + NOOPTransform({"sleep": 0}), + [table], + [expected_table], + expected_metadata_list, + ), + ( + NOOPTransform({"sleep": 0}), + [table], + [expected_table], + expected_metadata_list, + ), ] return fixtures diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/__init__.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/__init__.py index d6ec60e69..c3e34ef6c 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/__init__.py +++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/__init__.py @@ -1,9 +1,15 @@ from data_processing_ray.runtime.ray.ray_utils import RayUtils from data_processing_ray.runtime.ray.transform_statistics import TransformStatisticsRay from data_processing_ray.runtime.ray.transform_runtime import DefaultRayTransformRuntime -from data_processing_ray.runtime.ray.runtime_configuration import RayTransformRuntimeConfiguration -from data_processing_ray.runtime.ray.transform_file_processor import RayTransformFileProcessor -from data_processing_ray.runtime.ray.execution_configuration import RayTransformExecutionConfiguration +from data_processing_ray.runtime.ray.runtime_configuration import ( + RayTransformRuntimeConfiguration, +) +from data_processing_ray.runtime.ray.transform_file_processor import ( + RayTransformFileProcessor, +) +from data_processing_ray.runtime.ray.execution_configuration import ( + RayTransformExecutionConfiguration, +) from data_processing_ray.runtime.ray.transform_orchestrator import orchestrate from data_processing_ray.runtime.ray.transform_launcher import RayTransformLauncher from data_processing_ray.runtime.ray.transform_invoker import execute_ray_transform diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/execution_configuration.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/execution_configuration.py index b3ffd479f..6f119ca7f 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/execution_configuration.py +++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/execution_configuration.py @@ -66,7 +66,12 @@ def add_input_params(self, parser: argparse.ArgumentParser) -> None: help="AST string defining worker resource requirements.\n" + ParamsUtils.get_ast_help_text(help_example_dict), ) - parser.add_argument(f"--{cli_prefix}creation_delay", type=int, default=0, help="delay between actor' creation") + parser.add_argument( + f"--{cli_prefix}creation_delay", + type=int, + default=0, + help="delay between actors' creation", + ) return TransformExecutionConfiguration.add_input_params(self, parser=parser) def apply_input_params(self, args: argparse.Namespace) -> bool: diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/ray_utils.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/ray_utils.py index ff6e53892..b15a27dae 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/ray_utils.py +++ 
b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/ray_utils.py @@ -98,7 +98,11 @@ def get_available_nodes(available_nodes_gauge: Gauge = None) -> int: @staticmethod def create_actors( - clazz: type, params: dict[str, Any], actor_options: dict[str, Any], n_actors: int, creation_delay: int = 0 + clazz: type, + params: dict[str, Any], + actor_options: dict[str, Any], + n_actors: int, + creation_delay: int = 0, ) -> list[ActorHandle]: """ Create a set of actors @@ -120,7 +124,8 @@ def operator() -> ActorHandle: for i in range(120): time.sleep(1) alive = list_actors( - filters=[("class_name", "=", cls_name), ("state", "=", "ALIVE")], limit=RAY_MAX_ACTOR_LIMIT + filters=[("class_name", "=", cls_name), ("state", "=", "ALIVE")], + limit=RAY_MAX_ACTOR_LIMIT, ) if len(actors) == len(alive): return actors diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_file_processor.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_file_processor.py index cdad1309f..122ea6989 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_file_processor.py +++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_file_processor.py @@ -35,7 +35,7 @@ def __init__(self, params: dict[str, Any]): super().__init__( data_access_factory=params.get("data_access_factory", None), transform_parameters=dict(params.get("transform_params", {})), - is_folder=params.get("is_folder", False) + is_folder=params.get("is_folder", False), ) # Create statistics self.stats = params.get("statistics", None) diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_launcher.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_launcher.py index e3d59926c..7ec1de7df 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_launcher.py +++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_launcher.py @@ -53,7 +53,10 @@ def _get_arguments(self, parser: argparse.ArgumentParser) -> argparse.Namespace: :return: list of arguments """ parser.add_argument( - "--run_locally", type=lambda x: bool(str2bool(x)), default=False, help="running ray local flag" + "--run_locally", + type=lambda x: bool(str2bool(x)), + default=False, + help="running ray local flag", ) return super()._get_arguments(parser) diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py index 28eb2d231..4a5f3df96 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py +++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py @@ -87,7 +87,9 @@ def orchestrate( "data_access_factory": data_access_factory, "transform_class": runtime_config.get_transform_class(), "transform_params": runtime.get_transform_config( - data_access_factory=data_access_factory, statistics=statistics, files=files + data_access_factory=data_access_factory, + statistics=statistics, + files=files, ), "statistics": statistics, "is_folder": is_folder, @@ -150,7 +152,11 @@ def orchestrate( metadata = { "pipeline": preprocessing_params.pipeline_id, "job details": preprocessing_params.job_details - | {"start_time": start_ts, "end_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "status": status}, + | { + "start_time": start_ts, + "end_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "status": status, + }, "code": 
preprocessing_params.code_location, "job_input_params": runtime_config.get_transform_metadata() | data_access_factory.get_input_params() diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_runtime.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_runtime.py index 64479302c..24fb4cd76 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_runtime.py +++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_runtime.py @@ -12,7 +12,7 @@ from typing import Any -from data_processing.data_access import DataAccessFactoryBase, DataAccess +from data_processing.data_access import DataAccess, DataAccessFactoryBase from ray.actor import ActorHandle @@ -37,7 +37,10 @@ def get_folders(self, data_access: DataAccess) -> list[str]: raise NotImplemented() def get_transform_config( - self, data_access_factory: DataAccessFactoryBase, statistics: ActorHandle, files: list[str] + self, + data_access_factory: DataAccessFactoryBase, + statistics: ActorHandle, + files: list[str], ) -> dict[str, Any]: """ Get the dictionary of configuration that will be provided to the transform's initializer. diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_statistics.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_statistics.py index 2095820c8..edec278ba 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_statistics.py +++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_statistics.py @@ -32,10 +32,12 @@ def __init__(self, params: dict[str, Any]): self.source_files_counter = Counter(name="source_files_processed", description="Total source files processed") self.result_files_counter = Counter(name="result_files_written", description="Total result files written") self.source_documents_counter = Counter( - name="source_documents_processed", description="Total source document processed" + name="source_documents_processed", + description="Total source documents processed", ) self.result_documents_counter = Counter( - name="result_documents_written", description="Total result documents written" + name="result_documents_written", + description="Total result documents written", ) self.empty_table_counter = Counter(name="empty_tables", description="Total empty tables read") self.failed_read_counter = Counter(name="failed_read_files", description="Total read failed files") diff --git a/data-processing-lib/ray/src/data_processing_ray/test_support/transform/__init__.py b/data-processing-lib/ray/src/data_processing_ray/test_support/transform/__init__.py index dd095c961..e588e0c71 100644 --- a/data-processing-lib/ray/src/data_processing_ray/test_support/transform/__init__.py +++ b/data-processing-lib/ray/src/data_processing_ray/test_support/transform/__init__.py @@ -1,2 +1,6 @@ -from data_processing_ray.test_support.transform.noop_transform import NOOPRayTransformConfiguration -from data_processing_ray.test_support.transform.noop_folder_transform import NOOPFolderRayTransformConfiguration +from data_processing_ray.test_support.transform.noop_transform import ( + NOOPRayTransformConfiguration, +) +from data_processing_ray.test_support.transform.noop_folder_transform import ( + NOOPFolderRayTransformConfiguration, +) diff --git a/data-processing-lib/ray/src/data_processing_ray/test_support/transform/noop_folder_transform.py b/data-processing-lib/ray/src/data_processing_ray/test_support/transform/noop_folder_transform.py index 
1d084b58a..52074c9ca 100644 --- a/data-processing-lib/ray/src/data_processing_ray/test_support/transform/noop_folder_transform.py +++ b/data-processing-lib/ray/src/data_processing_ray/test_support/transform/noop_folder_transform.py @@ -11,14 +11,17 @@ ################################################################################ -from data_processing.test_support.transform import NOOPFolderTransform, NOOPTransformConfiguration +from data_processing.data_access import DataAccess +from data_processing.test_support.transform import ( + NOOPFolderTransform, + NOOPTransformConfiguration, +) from data_processing.utils import get_logger from data_processing_ray.runtime.ray import ( + DefaultRayTransformRuntime, RayTransformLauncher, RayTransformRuntimeConfiguration, - DefaultRayTransformRuntime ) -from data_processing.data_access import DataAccess logger = get_logger(__name__) @@ -45,8 +48,10 @@ def __init__(self): """ Initialization """ - super().__init__(transform_config=NOOPTransformConfiguration(clazz=NOOPFolderTransform), - runtime_class=NOOPFolderRayRuntime) + super().__init__( + transform_config=NOOPTransformConfiguration(clazz=NOOPFolderTransform), + runtime_class=NOOPFolderRayRuntime, + ) if __name__ == "__main__": diff --git a/data-processing-lib/ray/test/data_processing_ray_tests/invoker/ray_invoker_test.py b/data-processing-lib/ray/test/data_processing_ray_tests/invoker/ray_invoker_test.py index 7a72f07b7..916f240c8 100644 --- a/data-processing-lib/ray/test/data_processing_ray_tests/invoker/ray_invoker_test.py +++ b/data-processing-lib/ray/test/data_processing_ray_tests/invoker/ray_invoker_test.py @@ -32,10 +32,16 @@ def test_configuration(): def test_execution(): input_dir = os.path.abspath( - os.path.join(os.path.dirname(__file__), "../../../../../transforms/universal/noop/ray/test-data/input") + os.path.join( + os.path.dirname(__file__), + "../../../../../transforms/universal/noop/ray/test-data/input", + ) ) output_dir = os.path.abspath( - os.path.join(os.path.dirname(__file__), "../../../../../transforms/universal/noop/ray/output") + os.path.join( + os.path.dirname(__file__), + "../../../../../transforms/universal/noop/ray/output", + ) ) worker_options = {"num_cpus": 0.8} t_configuration = TransformsConfiguration() diff --git a/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_noop_folder_launch.py b/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_noop_folder_launch.py index cd61c6745..81856ee9e 100644 --- a/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_noop_folder_launch.py +++ b/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_noop_folder_launch.py @@ -16,7 +16,9 @@ AbstractTransformLauncherTest, ) from data_processing_ray.runtime.ray import RayTransformLauncher -from data_processing_ray.test_support.transform import NOOPFolderRayTransformConfiguration +from data_processing_ray.test_support.transform import ( + NOOPFolderRayTransformConfiguration, +) class TestRayNOOPTransform(AbstractTransformLauncherTest): @@ -29,5 +31,12 @@ def get_test_transform_fixtures(self) -> list[tuple]: basedir = "../../../../test-data/data_processing/ray/noop/" basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), basedir)) launcher = RayTransformLauncher(NOOPFolderRayTransformConfiguration()) - fixtures = [(launcher, {"noop_sleep_sec": 0, "run_locally": True}, basedir + "/input", basedir + "/expected")] + fixtures = [ + ( + launcher, + {"noop_sleep_sec": 0, "run_locally": True}, + 
basedir + "/input", + basedir + "/expected", + ) + ] return fixtures diff --git a/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_noop_launch.py b/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_noop_launch.py index e706a4dfa..0e7d7d8eb 100644 --- a/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_noop_launch.py +++ b/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_noop_launch.py @@ -29,5 +29,12 @@ def get_test_transform_fixtures(self) -> list[tuple]: basedir = "../../../../test-data/data_processing/ray/noop/" basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), basedir)) launcher = RayTransformLauncher(NOOPRayTransformConfiguration()) - fixtures = [(launcher, {"noop_sleep_sec": 0, "run_locally": True}, basedir + "/input", basedir + "/expected")] + fixtures = [ + ( + launcher, + {"noop_sleep_sec": 0, "run_locally": True}, + basedir + "/input", + basedir + "/expected", + ) + ] return fixtures diff --git a/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_util_test.py b/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_util_test.py index 694352dff..8353417d9 100644 --- a/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_util_test.py +++ b/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_util_test.py @@ -83,10 +83,22 @@ def test_actor_creation(): res = support.get_available_resources() print(f"\navailable resources {res}") - execs = support.create_actors(clazz=TransformStatisticsRay, params=params, actor_options=actor_options, n_actors=1) + execs = support.create_actors( + clazz=TransformStatisticsRay, + params=params, + actor_options=actor_options, + n_actors=1, + ) execs[0].add_stats.remote({"source_documents": 1, "source_size": 500}) - execs[0].add_stats.remote({"source_documents": 1, "source_size": 500, "result_documents": 1, "result_size": 300}) + execs[0].add_stats.remote( + { + "source_documents": 1, + "source_size": 500, + "result_documents": 1, + "result_size": 300, + } + ) stats = ray.get(execs[0].get_execution_stats.remote()) print(stats) diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/__init__.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/__init__.py index 9e0d19473..e1baba7b3 100644 --- a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/__init__.py +++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/__init__.py @@ -9,9 +9,19 @@ # See the License for the specific language governing permissions and # limitations under the License. 
################################################################################ -from data_processing_spark.runtime.spark.transform_runtime import DefaultSparkTransformRuntime -from data_processing_spark.runtime.spark.execution_configuration import SparkTransformExecutionConfiguration -from data_processing_spark.runtime.spark.runtime_configuration import SparkTransformRuntimeConfiguration -from data_processing_spark.runtime.spark.transform_file_processor import SparkTransformFileProcessor +from data_processing_spark.runtime.spark.transform_runtime import ( + DefaultSparkTransformRuntime, +) +from data_processing_spark.runtime.spark.execution_configuration import ( + SparkTransformExecutionConfiguration, +) +from data_processing_spark.runtime.spark.runtime_configuration import ( + SparkTransformRuntimeConfiguration, +) +from data_processing_spark.runtime.spark.transform_file_processor import ( + SparkTransformFileProcessor, +) from data_processing_spark.runtime.spark.transform_orchestrator import orchestrate -from data_processing_spark.runtime.spark.transform_launcher import SparkTransformLauncher +from data_processing_spark.runtime.spark.transform_launcher import ( + SparkTransformLauncher, +) diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/execution_configuration.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/execution_configuration.py index e6fb72128..1a9aebdd7 100644 --- a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/execution_configuration.py +++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/execution_configuration.py @@ -50,7 +50,12 @@ def add_input_params(self, parser: argparse.ArgumentParser) -> None: typically determined based on the cluster configuration or the available resources (number of workers). 
""" - parser.add_argument(f"--{runtime_cli_prefix}parallelization", type=int, default=-1, help="parallelization.") + parser.add_argument( + f"--{runtime_cli_prefix}parallelization", + type=int, + default=-1, + help="parallelization.", + ) return TransformExecutionConfiguration.add_input_params(self, parser=parser) def apply_input_params(self, args: argparse.Namespace) -> bool: diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_file_processor.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_file_processor.py index a0968ab1d..a85608d95 100644 --- a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_file_processor.py +++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_file_processor.py @@ -15,8 +15,8 @@ from data_processing.data_access import DataAccessFactoryBase from data_processing.runtime import AbstractTransformFileProcessor from data_processing.transform import TransformStatistics -from data_processing_spark.runtime.spark import SparkTransformRuntimeConfiguration from data_processing.utils import UnrecoverableException +from data_processing_spark.runtime.spark import SparkTransformRuntimeConfiguration class SparkTransformFileProcessor(AbstractTransformFileProcessor): diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_launcher.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_launcher.py index 677d14ada..4a64e4355 100644 --- a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_launcher.py +++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_launcher.py @@ -16,9 +16,9 @@ from data_processing.utils import get_logger from data_processing_spark.runtime.spark import ( SparkTransformExecutionConfiguration, + SparkTransformRuntimeConfiguration, orchestrate, ) -from data_processing_spark.runtime.spark import SparkTransformRuntimeConfiguration logger = get_logger(__name__) @@ -54,7 +54,7 @@ def _submit_for_execution(self) -> int: res = orchestrate( data_access_factory=self.data_access_factory, runtime_config=self.runtime_config, - execution_configuration=self.execution_config + execution_configuration=self.execution_config, ) logger.debug("Completed orchestrator") except Exception as e: diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py index 096fab272..1d7bdb77b 100644 --- a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py +++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py @@ -18,7 +18,7 @@ import yaml from data_processing.data_access import DataAccessFactoryBase -from data_processing.transform import TransformStatistics, AbstractFolderTransform +from data_processing.transform import AbstractFolderTransform, TransformStatistics from data_processing.utils import GB, get_logger from data_processing_spark.runtime.spark import ( SparkTransformExecutionConfiguration, @@ -130,7 +130,9 @@ def process_partition(iterator): # add additional parameters transform_params = ( runtime.get_transform_config( - partition=int(f[1]), data_access_factory=d_access_factory, statistics=statistics + partition=int(f[1]), + data_access_factory=d_access_factory, + statistics=statistics, ) | bcast_params ) @@ -153,7 +155,7 @@ def 
process_partition(iterator): # folder transform runtime = runtime_config.create_transform_runtime() files = runtime.get_folders(data_access=data_access) - logger.info(f"Number of folders is {len(files)}") # Get files to process + logger.info(f"Number of folders is {len(files)}") # Get files to process else: # Get files to process files, profile, retries = data_access.get_files_to_process() @@ -190,7 +192,12 @@ def process_partition(iterator): memory = 0.0 for i in range(executors.size()): memory += executors.toList().apply(i)._2()._1() - resources = {"cpus": cpus, "gpus": 0, "memory": round(memory / GB, 2), "object_store": 0} + resources = { + "cpus": cpus, + "gpus": 0, + "memory": round(memory / GB, 2), + "object_store": 0, + } input_params = runtime_config.get_transform_metadata() | execution_configuration.get_input_params() metadata = { "pipeline": execution_configuration.pipeline_id, diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_runtime.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_runtime.py index 7410d09d1..59ecd17dc 100644 --- a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_runtime.py +++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_runtime.py @@ -12,7 +12,7 @@ from typing import Any -from data_processing.data_access import DataAccessFactoryBase, DataAccess +from data_processing.data_access import DataAccess, DataAccessFactoryBase from data_processing.transform import TransformStatistics @@ -37,7 +37,10 @@ def get_folders(self, data_access: DataAccess) -> list[str]: raise NotImplemented() def get_transform_config( - self, partition: int, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics + self, + partition: int, + data_access_factory: DataAccessFactoryBase, + statistics: TransformStatistics, ) -> dict[str, Any]: """ Get the dictionary of configuration that will be provided to the transform's initializer. diff --git a/data-processing-lib/spark/src/data_processing_spark/test_support/transform/__init__.py b/data-processing-lib/spark/src/data_processing_spark/test_support/transform/__init__.py index 041cb43d6..c653601da 100644 --- a/data-processing-lib/spark/src/data_processing_spark/test_support/transform/__init__.py +++ b/data-processing-lib/spark/src/data_processing_spark/test_support/transform/__init__.py @@ -10,5 +10,9 @@ # limitations under the License. ################################################################################ -from data_processing_spark.test_support.transform.noop_transform import NOOPSparkTransformConfiguration -from data_processing_spark.test_support.transform.noop_folder_transform import NOOPFolderSparkTransformConfiguration +from data_processing_spark.test_support.transform.noop_transform import ( + NOOPSparkTransformConfiguration, +) +from data_processing_spark.test_support.transform.noop_folder_transform import ( + NOOPFolderSparkTransformConfiguration, +) diff --git a/data-processing-lib/spark/src/data_processing_spark/test_support/transform/noop_folder_transform.py b/data-processing-lib/spark/src/data_processing_spark/test_support/transform/noop_folder_transform.py index 9972e0f79..8612dcb4b 100644 --- a/data-processing-lib/spark/src/data_processing_spark/test_support/transform/noop_folder_transform.py +++ b/data-processing-lib/spark/src/data_processing_spark/test_support/transform/noop_folder_transform.py @@ -10,11 +10,17 @@ # limitations under the License. 
################################################################################ -from data_processing.test_support.transform import NOOPFolderTransform, NOOPTransformConfiguration -from data_processing.utils import get_logger -from data_processing_spark.runtime.spark import SparkTransformLauncher -from data_processing_spark.runtime.spark import SparkTransformRuntimeConfiguration, DefaultSparkTransformRuntime from data_processing.data_access import DataAccess +from data_processing.test_support.transform import ( + NOOPFolderTransform, + NOOPTransformConfiguration, +) +from data_processing.utils import get_logger +from data_processing_spark.runtime.spark import ( + DefaultSparkTransformRuntime, + SparkTransformLauncher, + SparkTransformRuntimeConfiguration, +) logger = get_logger(__name__) @@ -41,8 +47,10 @@ def __init__(self): """ Initialization """ - super().__init__(transform_config=NOOPTransformConfiguration(clazz=NOOPFolderTransform), - runtime_class=NOOPFolderSparkRuntime) + super().__init__( + transform_config=NOOPTransformConfiguration(clazz=NOOPFolderTransform), + runtime_class=NOOPFolderSparkRuntime, + ) if __name__ == "__main__": diff --git a/data-processing-lib/spark/src/data_processing_spark/test_support/transform/noop_transform.py b/data-processing-lib/spark/src/data_processing_spark/test_support/transform/noop_transform.py index 6054538e2..aa2cc66f7 100644 --- a/data-processing-lib/spark/src/data_processing_spark/test_support/transform/noop_transform.py +++ b/data-processing-lib/spark/src/data_processing_spark/test_support/transform/noop_transform.py @@ -13,8 +13,10 @@ NOOPTransformConfiguration, ) from data_processing.utils import get_logger -from data_processing_spark.runtime.spark import SparkTransformLauncher -from data_processing_spark.runtime.spark import SparkTransformRuntimeConfiguration +from data_processing_spark.runtime.spark import ( + SparkTransformLauncher, + SparkTransformRuntimeConfiguration, +) logger = get_logger(__name__) diff --git a/data-processing-lib/spark/test/data_processing_spark_tests/launch/spark/test_noop_folder_launch.py b/data-processing-lib/spark/test/data_processing_spark_tests/launch/spark/test_noop_folder_launch.py index c8e3ce40b..fba4ef7ba 100644 --- a/data-processing-lib/spark/test/data_processing_spark_tests/launch/spark/test_noop_folder_launch.py +++ b/data-processing-lib/spark/test/data_processing_spark_tests/launch/spark/test_noop_folder_launch.py @@ -16,7 +16,9 @@ AbstractTransformLauncherTest, ) from data_processing_spark.runtime.spark import SparkTransformLauncher -from data_processing_spark.test_support.transform import NOOPFolderSparkTransformConfiguration +from data_processing_spark.test_support.transform import ( + NOOPFolderSparkTransformConfiguration, +) class TestSparkNOOPTransform(AbstractTransformLauncherTest): diff --git a/examples/kfp-pipelines/superworkflows/ray/kfp_v1/superworkflow_code_sample_wf.py b/examples/kfp-pipelines/superworkflows/ray/kfp_v1/superworkflow_code_sample_wf.py index e98b79151..696dc79b7 100644 --- a/examples/kfp-pipelines/superworkflows/ray/kfp_v1/superworkflow_code_sample_wf.py +++ b/examples/kfp-pipelines/superworkflows/ray/kfp_v1/superworkflow_code_sample_wf.py @@ -61,16 +61,31 @@ def sample_code_ray_orchestrator( p1_orch_fuzzy_dedup_name: str = "fdedup_wf", p1_orch_tokenization_wf_name: str = "tokenization_wf", p2_pipeline_runtime_pipeline_id: str = "pipeline_id", - p2_pipeline_ray_head_options: dict = {"cpu": 1, "memory": 4, "image_pull_secret": ""}, - p2_pipeline_ray_worker_options: dict = 
{"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image_pull_secret": ""}, + p2_pipeline_ray_head_options: dict = { + "cpu": 1, + "memory": 4, + "image_pull_secret": "", + }, + p2_pipeline_ray_worker_options: dict = { + "replicas": 2, + "max_replicas": 2, + "min_replicas": 2, + "cpu": 2, + "memory": 4, + "image_pull_secret": "", + }, p2_pipeline_server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", p2_pipeline_input_parent_path: str = "test/code2parquet/input/", p2_pipeline_output_parent_path: str = "test/super/output/", p2_pipeline_parent_path_suffix: str = "", p2_pipeline_additional_params: str = '{"wait_interval": 2, "wait_cluster_ready_tmout": 400, "wait_cluster_up_tmout": 300, "wait_job_ready_tmout": 400, "wait_print_tmout": 30, "http_retries": 5, "delete_cluster_delay_minutes": 0}', p2_pipeline_data_s3_access_secret: str = "s3-secret", - p2_pipeline_runtime_code_location: dict = {'github': 'github', 'commit_hash': '12345', 'path': 'path'}, - p2_pipeline_runtime_actor_options: dict = {'num_cpus': 0.7}, + p2_pipeline_runtime_code_location: dict = { + "github": "github", + "commit_hash": "12345", + "path": "path", + }, + p2_pipeline_runtime_actor_options: dict = {"num_cpus": 0.7}, p2_pipeline_data_max_files: int = -1, p2_pipeline_data_num_samples: int = -1, # code to parquet step parameters @@ -266,12 +281,20 @@ def _set_component(op: dsl.BaseOp, displaied_name: str, prev_op: dsl.BaseOp = No _set_component(exact_dedup, "exact dedup", code_to_parquet) # document ID doc_id = run_doc_id_op( - name=p1_orch_doc_id_name, prefix="p5_", params=args, host=orch_host, input_folder=exact_dedup.output + name=p1_orch_doc_id_name, + prefix="p5_", + params=args, + host=orch_host, + input_folder=exact_dedup.output, ) _set_component(doc_id, "doc ID", exact_dedup) # fuzzy deduplication fuzzy_dedup = run_fuzzy_dedup_op( - name=p1_orch_fuzzy_dedup_name, prefix="p6_", params=args, host=orch_host, input_folder=doc_id.output + name=p1_orch_fuzzy_dedup_name, + prefix="p6_", + params=args, + host=orch_host, + input_folder=doc_id.output, ) _set_component(fuzzy_dedup, "fuzzy dedup", doc_id) @@ -287,19 +310,31 @@ def _set_component(op: dsl.BaseOp, displaied_name: str, prev_op: dsl.BaseOp = No # code_quality code_quality = run_code_quality_op( - name=p1_orch_code_quality_name, prefix="p8_", params=args, host=orch_host, input_folder=proglang_select.output + name=p1_orch_code_quality_name, + prefix="p8_", + params=args, + host=orch_host, + input_folder=proglang_select.output, ) _set_component(code_quality, "code_quality", proglang_select) # malware malware = run_malware_op( - name=p1_orch_malware_name, prefix="p9_", params=args, host=orch_host, input_folder=code_quality.output + name=p1_orch_malware_name, + prefix="p9_", + params=args, + host=orch_host, + input_folder=code_quality.output, ) _set_component(malware, "malware", code_quality) # license check license_select = run_license_select_op( - name=p1_orch_license_select_name, prefix="p10_", params=args, host=orch_host, input_folder=malware.output + name=p1_orch_license_select_name, + prefix="p10_", + params=args, + host=orch_host, + input_folder=malware.output, ) _set_component(license_select, "license_select", malware) diff --git a/examples/kfp-pipelines/superworkflows/ray/kfp_v1/superworkflow_dedups_sample_wf.py b/examples/kfp-pipelines/superworkflows/ray/kfp_v1/superworkflow_dedups_sample_wf.py index a27eeff17..738644c48 100644 --- 
a/examples/kfp-pipelines/superworkflows/ray/kfp_v1/superworkflow_dedups_sample_wf.py +++ b/examples/kfp-pipelines/superworkflows/ray/kfp_v1/superworkflow_dedups_sample_wf.py @@ -15,6 +15,7 @@ import kfp.dsl as dsl from workflow_support.compile_utils import ONE_WEEK_SEC + # Components # path to kfp component specifications files component_spec_path = "../../../../../kfp/kfp_ray_components/" @@ -27,6 +28,7 @@ ededup_image = "quay.io/dataprep1/data-prep-kit/ededup-ray:latest" fdedup_image = "quay.io/dataprep1/data-prep-kit/fdedup-ray:latest" + # Pipeline to invoke execution on remote resource @dsl.pipeline( name="sample-super-kubeflow-pipeline", @@ -38,16 +40,31 @@ def sample_ray_orchestrator( p1_orch_exact_dedup_name: str = "ededup_wf", p1_orch_fuzzy_dedup_name: str = "fdedup_wf", p2_pipeline_runtime_pipeline_id: str = "pipeline_id", - p2_pipeline_ray_head_options: dict = {"cpu": 1, "memory": 4, "image_pull_secret": ""}, - p2_pipeline_ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image_pull_secret": ""}, + p2_pipeline_ray_head_options: dict = { + "cpu": 1, + "memory": 4, + "image_pull_secret": "", + }, + p2_pipeline_ray_worker_options: dict = { + "replicas": 2, + "max_replicas": 2, + "min_replicas": 2, + "cpu": 2, + "memory": 4, + "image_pull_secret": "", + }, p2_pipeline_server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", p2_pipeline_input_parent_path: str = "test/doc_id/input/", p2_pipeline_output_parent_path: str = "test/super/output/", p2_pipeline_parent_path_suffix: str = "", p2_pipeline_additional_params: str = '{"wait_interval": 2, "wait_cluster_ready_tmout": 400, "wait_cluster_up_tmout": 300, "wait_job_ready_tmout": 400, "wait_print_tmout": 30, "http_retries": 5, "delete_cluster_delay_minutes": 0}', p2_pipeline_data_s3_access_secret: str = "s3-secret", - p2_pipeline_runtime_code_location: dict = {'github': 'github', 'commit_hash': '12345', 'path': 'path'}, - p2_pipeline_runtime_actor_options: dict = {'num_cpus': 0.7}, + p2_pipeline_runtime_code_location: dict = { + "github": "github", + "commit_hash": "12345", + "path": "path", + }, + p2_pipeline_runtime_actor_options: dict = {"num_cpus": 0.7}, # data access. 
p2_pipeline_data_max_files: int = -1, p2_pipeline_data_num_samples: int = -1, @@ -134,19 +151,31 @@ def _set_component(op: dsl.BaseOp, displaied_name: str, prev_op: dsl.BaseOp = No # document ID doc_id = run_doc_id_op( - name=p1_orch_doc_id_name, prefix="p3_", params=args, host=orch_host, input_folder=p2_pipeline_input_parent_path + name=p1_orch_doc_id_name, + prefix="p3_", + params=args, + host=orch_host, + input_folder=p2_pipeline_input_parent_path, ) _set_component(doc_id, "doc ID") # exact deduplication exact_dedup = run_exact_dedup_op( - name=p1_orch_exact_dedup_name, prefix="p4_", params=args, host=orch_host, input_folder=doc_id.output + name=p1_orch_exact_dedup_name, + prefix="p4_", + params=args, + host=orch_host, + input_folder=doc_id.output, ) _set_component(exact_dedup, "exact dedup", doc_id) # fuzzy deduplication fuzzy_dedup = run_fuzzy_dedup_op( - name=p1_orch_fuzzy_dedup_name, prefix="p5_", params=args, host=orch_host, input_folder=exact_dedup.output + name=p1_orch_fuzzy_dedup_name, + prefix="p5_", + params=args, + host=orch_host, + input_folder=exact_dedup.output, ) _set_component(fuzzy_dedup, "fuzzy dedup", exact_dedup) diff --git a/examples/kfp-pipelines/superworkflows/ray/kfp_v2/superpipeline_noop_docId_v2_wf.py b/examples/kfp-pipelines/superworkflows/ray/kfp_v2/superpipeline_noop_docId_v2_wf.py index 434d84ab0..66d396d86 100644 --- a/examples/kfp-pipelines/superworkflows/ray/kfp_v2/superpipeline_noop_docId_v2_wf.py +++ b/examples/kfp-pipelines/superworkflows/ray/kfp_v2/superpipeline_noop_docId_v2_wf.py @@ -51,7 +51,11 @@ def super_pipeline( p1_pipeline_intermediate_path: str = "test/super/output/tmp", p1_pipeline_additional_params: str = '{"wait_interval": 2, "wait_cluster_ready_tmout": 400, "wait_cluster_up_tmout": 300, "wait_job_ready_tmout": 400, "wait_print_tmout": 30, "http_retries": 5, "delete_cluster_delay_minutes": 0}', p1_pipeline_data_s3_access_secret: str = "s3-secret", - p1_pipeline_runtime_code_location: dict = {"github": "github", "commit_hash": "12345", "path": "path"}, + p1_pipeline_runtime_code_location: dict = { + "github": "github", + "commit_hash": "12345", + "path": "path", + }, p1_pipeline_runtime_actor_options: dict = {"num_cpus": 0.8}, # data access p1_pipeline_data_max_files: int = -1, @@ -62,7 +66,12 @@ def super_pipeline( p2_skip: bool = False, p2_noop_sleep_sec: int = 10, p2_ray_name: str = "noop-kfp-ray", - p2_ray_head_options: dict = {"cpu": 1, "memory": 4, "image_pull_secret": "", "image": noop_image}, + p2_ray_head_options: dict = { + "cpu": 1, + "memory": 4, + "image_pull_secret": "", + "image": noop_image, + }, p2_ray_worker_options: dict = { "replicas": 2, "max_replicas": 2, @@ -75,7 +84,12 @@ def super_pipeline( # Document ID step parameters p3_name: str = "doc_id", p3_ray_name: str = "docid-kfp-ray", - p3_ray_head_options: dict = {"cpu": 1, "memory": 4, "image_pull_secret": "", "image": doc_id_image}, + p3_ray_head_options: dict = { + "cpu": 1, + "memory": 4, + "image_pull_secret": "", + "image": doc_id_image, + }, p3_ray_worker_options: dict = { "replicas": 2, "max_replicas": 2, diff --git a/examples/notebooks/html-processing/my_config.py b/examples/notebooks/html-processing/my_config.py index 7848370d1..2d337d7ab 100644 --- a/examples/notebooks/html-processing/my_config.py +++ b/examples/notebooks/html-processing/my_config.py @@ -1,35 +1,37 @@ -import os +import os + ## Configuration class MyConfig: - pass + pass + -MY_CONFIG = MyConfig () +MY_CONFIG = MyConfig() ## Crawl settings -MY_CONFIG.CRAWL_URL_BASE = 
'https://thealliance.ai/' +MY_CONFIG.CRAWL_URL_BASE = "https://thealliance.ai/" # MY_CONFIG.CRAWL_URL_BASE = 'https://apache.org/' MY_CONFIG.CRAWL_MAX_DOWNLOADS = 20 MY_CONFIG.CRAWL_MAX_DEPTH = 2 -MY_CONFIG.CRAWL_MIME_TYPE = 'text/html' +MY_CONFIG.CRAWL_MIME_TYPE = "text/html" ## Directories MY_CONFIG.INPUT_DIR = "input" # MY_CONFIG.INPUT_DIR = "input2/thealliance.ai/" MY_CONFIG.OUTPUT_DIR = "output" -MY_CONFIG.OUTPUT_DIR_HTML = os.path.join(MY_CONFIG.OUTPUT_DIR , "1-html2parquet") -MY_CONFIG.OUTPUT_DIR_MARKDOWN = os.path.join(MY_CONFIG.OUTPUT_DIR , "2-markdown") +MY_CONFIG.OUTPUT_DIR_HTML = os.path.join(MY_CONFIG.OUTPUT_DIR, "1-html2parquet") +MY_CONFIG.OUTPUT_DIR_MARKDOWN = os.path.join(MY_CONFIG.OUTPUT_DIR, "2-markdown") ### ------------------------------- # MY_CONFIG.EMBEDDING_MODEL = 'sentence-transformers/all-MiniLM-L6-v2' -MY_CONFIG.EMBEDDING_MODEL = 'BAAI/bge-small-en-v1.5' +MY_CONFIG.EMBEDDING_MODEL = "BAAI/bge-small-en-v1.5" MY_CONFIG.EMBEDDING_LENGTH = 384 ### Milvus config -MY_CONFIG.DB_URI = './rag_html.db' # For embedded instance -MY_CONFIG.COLLECTION_NAME = 'docs' +MY_CONFIG.DB_URI = "./rag_html.db" # For embedded instance +MY_CONFIG.COLLECTION_NAME = "docs" ## LLM Model diff --git a/examples/notebooks/html-processing/my_utils.py b/examples/notebooks/html-processing/my_utils.py index acea186c4..dde3bcf15 100644 --- a/examples/notebooks/html-processing/my_utils.py +++ b/examples/notebooks/html-processing/my_utils.py @@ -1,28 +1,27 @@ +import glob import os +from urllib.parse import unquote + +import magic +import pandas as pd import requests +from dpk_connector.core.utils import urlparse_cached from humanfriendly import format_size -import pandas as pd -import glob -import magic -from dpk_connector.core.utils import ( - urlparse_cached -) -from urllib.parse import unquote ## Reads parquet files in a folder into a pandas dataframe -def read_parquet_files_as_df (parquet_dir): - parquet_files = glob.glob(f'{parquet_dir}/*.parquet') +def read_parquet_files_as_df(parquet_dir): + parquet_files = glob.glob(f"{parquet_dir}/*.parquet") # read each parquet file into a DataFrame and store in a list - dfs = [pd.read_parquet (f) for f in parquet_files] + dfs = [pd.read_parquet(f) for f in parquet_files] # Concatenate all DataFrames into a single DataFrame data_df = pd.concat(dfs, ignore_index=True) return data_df -def download_file(url, local_file, chunk_size=1024*1024): +def download_file(url, local_file, chunk_size=1024 * 1024): """ Downloads a remote URL to a local file. @@ -33,7 +32,7 @@ def download_file(url, local_file, chunk_size=1024*1024): Returns: None - + Example usage: download_file('http://example.com/file.txt', 'file.txt', chunk_size=1024*1024) # Download in chunks of 1MB """ @@ -49,31 +48,34 @@ def download_file(url, local_file, chunk_size=1024*1024): # Stream the file download with requests.get(url, stream=True) as r: r.raise_for_status() - with open(local_file, 'wb') as f: + with open(local_file, "wb") as f: for chunk in r.iter_content(chunk_size=chunk_size): - if chunk: # filter out keep-alive new chunks + if chunk: # filter out keep-alive new chunks f.write(chunk) print() file_size = format_size(os.path.getsize(local_file)) print(f"{local_file} ({file_size}) downloaded successfully.") + + ## --- end: download_file ------ + def get_mime_type(byte_data: bytes) -> str: """ Obtain the MIME type for provided byte data using the magic library. Args: byte_data: bytes: Bytes data to identify mimetype for. - + Returns: str: Mimetype for given bytes data. 
- + Example: >>> byte_data = b'\n...' >>> get_mime_type(byte_data) 'text/html' """ - + # Validate input type if not isinstance(byte_data, bytes): raise TypeError("Input must be of type 'bytes'") @@ -99,4 +101,4 @@ def get_filename_from_url(url: str) -> str: parsed = urlparse_cached(url) basename = unquote(os.path.splitext(os.path.basename(parsed.path))[0]) ext = get_extension(url) - return basename + ext \ No newline at end of file + return basename + ext diff --git a/examples/notebooks/intro/my_utils.py b/examples/notebooks/intro/my_utils.py index 9a6477dfc..c4eef5e4e 100644 --- a/examples/notebooks/intro/my_utils.py +++ b/examples/notebooks/intro/my_utils.py @@ -1,23 +1,24 @@ +import glob import os + +import pandas as pd import requests from humanfriendly import format_size -import pandas as pd -import glob ## Reads parquet files in a folder into a pandas dataframe -def read_parquet_files_as_df (parquet_dir): - parquet_files = glob.glob(f'{parquet_dir}/*.parquet') +def read_parquet_files_as_df(parquet_dir): + parquet_files = glob.glob(f"{parquet_dir}/*.parquet") # read each parquet file into a DataFrame and store in a list - dfs = [pd.read_parquet (f) for f in parquet_files] + dfs = [pd.read_parquet(f) for f in parquet_files] # Concatenate all DataFrames into a single DataFrame data_df = pd.concat(dfs, ignore_index=True) return data_df -def download_file(url, local_file, chunk_size=1024*1024): +def download_file(url, local_file, chunk_size=1024 * 1024): """ Downloads a remote URL to a local file. @@ -28,7 +29,7 @@ def download_file(url, local_file, chunk_size=1024*1024): Returns: None - + Example usage: download_file('http://example.com/file.txt', 'file.txt', chunk_size=1024*1024) # Download in chunks of 1MB """ @@ -44,12 +45,13 @@ def download_file(url, local_file, chunk_size=1024*1024): # Stream the file download with requests.get(url, stream=True) as r: r.raise_for_status() - with open(local_file, 'wb') as f: + with open(local_file, "wb") as f: for chunk in r.iter_content(chunk_size=chunk_size): - if chunk: # filter out keep-alive new chunks + if chunk: # filter out keep-alive new chunks f.write(chunk) print() file_size = format_size(os.path.getsize(local_file)) print(f"{local_file} ({file_size}) downloaded successfully.") -## --- end: download_file ------ + +## --- end: download_file ------ diff --git a/examples/notebooks/rag/my_config.py b/examples/notebooks/rag/my_config.py index 66fc1ecf7..2b558f062 100644 --- a/examples/notebooks/rag/my_config.py +++ b/examples/notebooks/rag/my_config.py @@ -1,25 +1,26 @@ -import os +import os ## Configuration class MyConfig: - pass + pass -MY_CONFIG = MyConfig () + +MY_CONFIG = MyConfig() ## Input Data - configure this to the folder we want to process MY_CONFIG.INPUT_DATA_DIR = "input" MY_CONFIG.OUTPUT_FOLDER = "output" -MY_CONFIG.OUTPUT_FOLDER_FINAL = os.path.join(MY_CONFIG.OUTPUT_FOLDER , "output_final") +MY_CONFIG.OUTPUT_FOLDER_FINAL = os.path.join(MY_CONFIG.OUTPUT_FOLDER, "output_final") ### ------------------------------- ### Milvus config -MY_CONFIG.DB_URI = './rag_1_dpk.db' # For embedded instance -MY_CONFIG.COLLECTION_NAME = 'dpk_papers' +MY_CONFIG.DB_URI = "./rag_1_dpk.db" # For embedded instance +MY_CONFIG.COLLECTION_NAME = "dpk_papers" ## Embedding model -MY_CONFIG.EMBEDDING_MODEL = 'sentence-transformers/all-MiniLM-L6-v2' +MY_CONFIG.EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2" MY_CONFIG.EMBEDDING_LENGTH = 384 ## LLM Model @@ -30,11 +31,11 @@ class MyConfig: ## RAY CONFIGURATION -num_cpus_available = os.cpu_count() 
+num_cpus_available = os.cpu_count() # print (num_cpus_available) # MY_CONFIG.RAY_NUM_CPUS = num_cpus_available // 2 ## use half the available cores for processing -MY_CONFIG.RAY_NUM_CPUS = 0.8 +MY_CONFIG.RAY_NUM_CPUS = 0.8 # print (MY_CONFIG.RAY_NUM_CPUS) MY_CONFIG.RAY_MEMORY_GB = 2 # GB # MY_CONFIG.RAY_RUNTIME_WORKERS = num_cpus_available // 3 -MY_CONFIG.RAY_RUNTIME_WORKERS = 2 \ No newline at end of file +MY_CONFIG.RAY_RUNTIME_WORKERS = 2 diff --git a/examples/notebooks/rag/utils.py b/examples/notebooks/rag/utils.py index a0474b6c9..8b087be6c 100644 --- a/examples/notebooks/rag/utils.py +++ b/examples/notebooks/rag/utils.py @@ -1,14 +1,17 @@ +import glob import os + +import pandas as pd import requests from humanfriendly import format_size -import pandas as pd -import glob + rootdir = os.path.abspath(os.path.join(__file__, "../../../../")) -## Reads parquet files in a folder into a pandas dataframe -def read_parquet_files_as_df (parquet_dir): - parquet_files = glob.glob(f'{parquet_dir}/*.parquet') + +## Reads parquet files in a folder into a pandas dataframe +def read_parquet_files_as_df(parquet_dir): + parquet_files = glob.glob(f"{parquet_dir}/*.parquet") # Create an empty list to store the DataFrames dfs = [] @@ -24,17 +27,17 @@ def read_parquet_files_as_df (parquet_dir): return data_df -def inspect_parquet (parquet_dir, sample_size = 5, display_columns = None): - - data_df = read_parquet_files_into_df (parquet_dir) +def inspect_parquet(parquet_dir, sample_size=5, display_columns=None): + + data_df = read_parquet_files_as_df(parquet_dir) if display_columns is not None: data_df = data_df[display_columns] - #print(data_df.head(sample_size)) + # print(data_df.head(sample_size)) return data_df.head(sample_size) -def download_file(url, local_file, chunk_size=1024*1024): +def download_file(url, local_file, chunk_size=1024 * 1024): """ Downloads a remote URL to a local file. 
@@ -45,7 +48,7 @@ def download_file(url, local_file, chunk_size=1024*1024): Returns: None - + Example usage: download_file('http://example.com/file.txt', 'file.txt', chunk_size=1024*1024) # Download in chunks of 1MB """ @@ -61,12 +64,13 @@ def download_file(url, local_file, chunk_size=1024*1024): # Stream the file download with requests.get(url, stream=True) as r: r.raise_for_status() - with open(local_file, 'wb') as f: + with open(local_file, "wb") as f: for chunk in r.iter_content(chunk_size=chunk_size): - if chunk: # filter out keep-alive new chunks + if chunk: # filter out keep-alive new chunks f.write(chunk) print() file_size = format_size(os.path.getsize(local_file)) print(f"{local_file} ({file_size}) downloaded successfully.") -## --- end: download_file ------ + +## --- end: download_file ------ diff --git a/kfp/kfp_ray_components/src/create_ray_cluster.py b/kfp/kfp_ray_components/src/create_ray_cluster.py index 131f20c8c..2d91dacbd 100644 --- a/kfp/kfp_ray_components/src/create_ray_cluster.py +++ b/kfp/kfp_ray_components/src/create_ray_cluster.py @@ -42,7 +42,11 @@ def start_ray_cluster( head_options = KFPUtils.load_from_json(ray_head_options.replace("'", '"')) worker_node = KFPUtils.load_from_json(ray_worker_options.replace("'", '"')) head_node = head_options | { - "ray_start_params": {"metrics-export-port": "8080", "num-cpus": "0", "dashboard-host": "0.0.0.0"} + "ray_start_params": { + "metrics-export-port": "8080", + "num-cpus": "0", + "dashboard-host": "0.0.0.0", + } } tolerations = os.getenv("KFP_TOLERATIONS", "") if tolerations != "": diff --git a/kfp/kfp_ray_components/src/execute_ray_job_multi_s3.py b/kfp/kfp_ray_components/src/execute_ray_job_multi_s3.py index 1f5470fe2..9cce6f896 100644 --- a/kfp/kfp_ray_components/src/execute_ray_job_multi_s3.py +++ b/kfp/kfp_ray_components/src/execute_ray_job_multi_s3.py @@ -48,7 +48,9 @@ # extra credentials prefix = args.prefix extra_access_key, extra_secret_key, extra_url = KFPUtils.credentials( - access_key=f"{prefix}_S3_KEY", secret_key=f"{prefix}_S3_SECRET", endpoint=f"{prefix}_ENDPOINT" + access_key=f"{prefix}_S3_KEY", + secret_key=f"{prefix}_S3_SECRET", + endpoint=f"{prefix}_ENDPOINT", ) exec_params[f"{prefix}_s3_cred"] = ( "{'access_key': '" diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/compile_utils/__init__.py b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/compile_utils/__init__.py index e13f8e8a7..78d437ebe 100644 --- a/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/compile_utils/__init__.py +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/compile_utils/__init__.py @@ -1 +1,6 @@ -from workflow_support.compile_utils.component import ONE_HOUR_SEC, ONE_DAY_SEC, ONE_WEEK_SEC, ComponentUtils +from workflow_support.compile_utils.component import ( + ONE_HOUR_SEC, + ONE_DAY_SEC, + ONE_WEEK_SEC, + ComponentUtils, +) diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/compile_utils/component.py b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/compile_utils/component.py index 1b1c8ba24..63208ef96 100644 --- a/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/compile_utils/component.py +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/compile_utils/component.py @@ -106,7 +106,11 @@ def _add_node_selector() -> None: def set_s3_env_vars_to_component( component: dsl.ContainerOp, secret: str, - env2key: dict[str, str] = {"S3_KEY": "s3-key", "S3_SECRET": "s3-secret", 
"ENDPOINT": "s3-endpoint"}, + env2key: dict[str, str] = { + "S3_KEY": "s3-key", + "S3_SECRET": "s3-secret", + "ENDPOINT": "s3-endpoint", + }, prefix: str = None, ) -> None: """ diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/pipeline_utils/pipeline_utils.py b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/pipeline_utils/pipeline_utils.py index 3bc660c19..e89537507 100644 --- a/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/pipeline_utils/pipeline_utils.py +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/pipeline_utils/pipeline_utils.py @@ -61,7 +61,9 @@ def upload_pipeline( return None try: pipeline = self.kfp_client.upload_pipeline( - pipeline_package_path=pipeline_package_path, pipeline_name=pipeline_name, description=description + pipeline_package_path=pipeline_package_path, + pipeline_name=pipeline_name, + description=description, ) except Exception as e: logger.warning(f"Exception uploading pipeline {e}") @@ -100,7 +102,10 @@ def start_pipeline( job_name = pipeline.name + " " + datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S") try: run_id = self.kfp_client.run_pipeline( - experiment_id=experiment.id, job_name=job_name, pipeline_id=pipeline.id, params=params + experiment_id=experiment.id, + job_name=job_name, + pipeline_id=pipeline.id, + params=params, ) logger.info(f"Pipeline run {job_name} submitted") return run_id.id @@ -156,7 +161,13 @@ def wait_pipeline_completion(self, run_id: str, timeout: int = -1, wait: int = 6 end = 2**63 - 1 run_details = self.kfp_client.get_run(run_id=run_id) status = run_details.run.status - while status is None or status.lower() not in ["succeeded", "completed", "failed", "skipped", "error"]: + while status is None or status.lower() not in [ + "succeeded", + "completed", + "failed", + "skipped", + "error", + ]: time.sleep(wait) if (end - time.time()) < 0: return "failed", f"Execution is taking too long" diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/pipeline_utils/pipelines_tests_utils.py b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/pipeline_utils/pipelines_tests_utils.py index f60e1196e..de7df9d13 100644 --- a/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/pipeline_utils/pipelines_tests_utils.py +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/pipeline_utils/pipelines_tests_utils.py @@ -20,7 +20,11 @@ logger = get_logger(__name__) -def run_test(pipeline_package_path: str, endpoint: str = "http://localhost:8080/", overwrite: bool = True): +def run_test( + pipeline_package_path: str, + endpoint: str = "http://localhost:8080/", + overwrite: bool = True, +): """ Upload and run a single pipeline diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/compile_utils/__init__.py b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/compile_utils/__init__.py index e13f8e8a7..78d437ebe 100644 --- a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/compile_utils/__init__.py +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/compile_utils/__init__.py @@ -1 +1,6 @@ -from workflow_support.compile_utils.component import ONE_HOUR_SEC, ONE_DAY_SEC, ONE_WEEK_SEC, ComponentUtils +from workflow_support.compile_utils.component import ( + ONE_HOUR_SEC, + ONE_DAY_SEC, + ONE_WEEK_SEC, + ComponentUtils, +) diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/compile_utils/component.py 
b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/compile_utils/component.py index 28f36acf7..8505d7d07 100644 --- a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/compile_utils/component.py +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/compile_utils/component.py @@ -86,7 +86,9 @@ def _add_node_selector() -> None: logger.warning(f"Exception while handling node_selector {e}") kubernetes.use_field_path_as_env( - task, env_name=RUN_NAME, field_path="metadata.annotations['pipelines.kubeflow.org/run_name']" + task, + env_name=RUN_NAME, + field_path="metadata.annotations['pipelines.kubeflow.org/run_name']", ) # Set cashing task.set_caching_options(enable_caching=cache_strategy) @@ -114,7 +116,11 @@ def set_s3_env_vars_to_component( :param prefix: prefix to add to env name """ if env2key is None: - env2key = {"s3-key": "S3_KEY", "s3-secret": "S3_SECRET", "s3-endpoint": "ENDPOINT"} + env2key = { + "s3-key": "S3_KEY", + "s3-secret": "S3_SECRET", + "s3-endpoint": "ENDPOINT", + } if prefix is not None: for secret_key, _ in env2key.items(): diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/pipeline_utils/pipeline_utils.py b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/pipeline_utils/pipeline_utils.py index 7403d01a6..f6d93c8f4 100644 --- a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/pipeline_utils/pipeline_utils.py +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/pipeline_utils/pipeline_utils.py @@ -64,7 +64,9 @@ def upload_pipeline( return None try: pipeline = self.kfp_client.upload_pipeline( - pipeline_package_path=pipeline_package_path, pipeline_name=pipeline_name, description=description + pipeline_package_path=pipeline_package_path, + pipeline_name=pipeline_name, + description=description, ) except Exception as e: logger.warning(f"Exception uploading pipeline {e}") diff --git a/kfp/kfp_support_lib/shared_workflow_support/src/python_apiserver_client/kuberay_apis.py b/kfp/kfp_support_lib/shared_workflow_support/src/python_apiserver_client/kuberay_apis.py index 9051e7c73..f66689cf4 100644 --- a/kfp/kfp_support_lib/shared_workflow_support/src/python_apiserver_client/kuberay_apis.py +++ b/kfp/kfp_support_lib/shared_workflow_support/src/python_apiserver_client/kuberay_apis.py @@ -81,7 +81,11 @@ def list_compute_templates(self) -> tuple[int, str, list[Template]]: try: response = requests.get(url, headers=_headers, timeout=TIMEOUT) if response.status_code // 100 == 2: - return response.status_code, None, templates_decoder(response.json()) + return ( + response.status_code, + None, + templates_decoder(response.json()), + ) else: logger.warning(f"Failed to list compute templates, status : {response.status_code}") status = response.status_code @@ -110,7 +114,11 @@ def list_compute_templates_namespace(self, ns: str) -> tuple[int, str, list[Temp try: response = requests.get(url, headers=_headers, timeout=TIMEOUT) if response.status_code // 100 == 2: - return response.status_code, None, templates_decoder(response.json()) + return ( + response.status_code, + None, + templates_decoder(response.json()), + ) else: logger.warning( f"Failed to list compute templates for namespace {ns}, status : {response.status_code}" @@ -394,7 +402,11 @@ def get_cluster_endpoints(self, ns: str, name: str, wait: int = -1) -> tuple[int status, error, cluster = self.get_cluster(ns=ns, name=name) if status // 100 != 2: return status, error, None - return status, None, 
f"{name}-head-svc.{ns}.svc.cluster.local:{cluster.service_endpoint['dashboard']}" + return ( + status, + None, + f"{name}-head-svc.{ns}.svc.cluster.local:{cluster.service_endpoint['dashboard']}", + ) def delete_cluster(self, ns: str, name: str) -> tuple[int, str]: """ @@ -518,7 +530,11 @@ def list_job_info(self, ns: str, name: str) -> tuple[int, str, list[RayJobInfo]] if job_info_array is None: return response.status_code, None, [] else: - return response.status_code, None, [RayJobInfo(i) for i in job_info_array] + return ( + response.status_code, + None, + [RayJobInfo(i) for i in job_info_array], + ) else: logger.warning( f"Failed to list jobs from the cluster {name} in namespace {ns}, " diff --git a/kfp/kfp_support_lib/shared_workflow_support/src/python_apiserver_client/params/environmentvariables.py b/kfp/kfp_support_lib/shared_workflow_support/src/python_apiserver_client/params/environmentvariables.py index d1056f6f6..13490557a 100644 --- a/kfp/kfp_support_lib/shared_workflow_support/src/python_apiserver_client/params/environmentvariables.py +++ b/kfp/kfp_support_lib/shared_workflow_support/src/python_apiserver_client/params/environmentvariables.py @@ -137,7 +137,11 @@ def env_var_from_decoder(dct: dict[str, Any]) -> EnvVarFrom: :param dct: dictionary representations of environment from :return: environment from """ - return EnvVarFrom(name=dct.get("name", ""), source=EnvVarSource(int(dct.get("source", 0))), key=dct.get("key", "")) + return EnvVarFrom( + name=dct.get("name", ""), + source=EnvVarSource(int(dct.get("source", 0))), + key=dct.get("key", ""), + ) def environment_variables_decoder(dct: dict[str, Any]) -> EnvironmentVariables: diff --git a/kfp/kfp_support_lib/shared_workflow_support/src/python_apiserver_client/params/headnode.py b/kfp/kfp_support_lib/shared_workflow_support/src/python_apiserver_client/params/headnode.py index 37c2e2572..a1638f4ed 100644 --- a/kfp/kfp_support_lib/shared_workflow_support/src/python_apiserver_client/params/headnode.py +++ b/kfp/kfp_support_lib/shared_workflow_support/src/python_apiserver_client/params/headnode.py @@ -21,7 +21,11 @@ ) -DEFAULT_HEAD_START_PARAMS = {"dashboard-host": "0.0.0.0", "metrics-export-port": "8080", "num-cpus": "0"} +DEFAULT_HEAD_START_PARAMS = { + "dashboard-host": "0.0.0.0", + "metrics-export-port": "8080", + "num-cpus": "0", +} class ServiceType(enum.Enum): @@ -141,7 +145,10 @@ def to_dict(self) -> dict[str, Any]: Convert to dictionary :return: dictionary representation of the head node """ - dct = {"computeTemplate": self.compute_template, "rayStartParams": self.ray_start_params} + dct = { + "computeTemplate": self.compute_template, + "rayStartParams": self.ray_start_params, + } if self.image is not None: dct["image"] = self.image if self.service_type is not None: diff --git a/kfp/kfp_support_lib/shared_workflow_support/src/python_apiserver_client/params/templates.py b/kfp/kfp_support_lib/shared_workflow_support/src/python_apiserver_client/params/templates.py index 0ef4c1583..55b978efd 100644 --- a/kfp/kfp_support_lib/shared_workflow_support/src/python_apiserver_client/params/templates.py +++ b/kfp/kfp_support_lib/shared_workflow_support/src/python_apiserver_client/params/templates.py @@ -48,7 +48,13 @@ class Toleration: - to_dict() -> dict[str, Any] convert to dict """ - def __init__(self, key: str, operator: TolerationOperation, effect: TolerationEffect, value: str = None): + def __init__( + self, + key: str, + operator: TolerationOperation, + effect: TolerationEffect, + value: str = None, + ): """ Initialization 
:param key: key @@ -77,7 +83,11 @@ def to_dict(self) -> dict[str, Any]: Convert to string :return: string representation of toleration """ - dct = {"key": self.key, "operator": self.operator.value, "effect": self.effect.value} + dct = { + "key": self.key, + "operator": self.operator.value, + "effect": self.effect.value, + } if self.value is not None: dct["value"] = self.value return dct @@ -165,7 +175,12 @@ def to_dict(self) -> dict[str, Any]: Convert to dictionary :return: dictionary representation of template """ - dct = {"name": self.name, "namespace": self.namespace, "cpu": self.cpu, "memory": self.memory} + dct = { + "name": self.name, + "namespace": self.namespace, + "cpu": self.cpu, + "memory": self.memory, + } if self.gpu > 0: dct["gpu"] = self.gpu if self.gpu_accelerator is not None: diff --git a/kfp/kfp_support_lib/shared_workflow_support/src/python_apiserver_client/params/volumes.py b/kfp/kfp_support_lib/shared_workflow_support/src/python_apiserver_client/params/volumes.py index fee0e1ea4..c053513f6 100644 --- a/kfp/kfp_support_lib/shared_workflow_support/src/python_apiserver_client/params/volumes.py +++ b/kfp/kfp_support_lib/shared_workflow_support/src/python_apiserver_client/params/volumes.py @@ -115,7 +115,12 @@ def to_dict(self) -> dict[str, Any]: Convert to dictionary :return: HostPathVolume dictionary representation """ - dst = {"name": self.name, "mountPath": self.mount_path, "source": self.source, "volumeType": self.volume_type} + dst = { + "name": self.name, + "mountPath": self.mount_path, + "source": self.source, + "volumeType": self.volume_type, + } if self.mount_propagation is not None: dst["mountPropagationMode"] = self.mount_propagation.value if self.host_path_type is not None: @@ -172,7 +177,12 @@ def to_dict(self) -> dict[str, Any]: Convert to dictionary :return: PVCVolume dictionary representation """ - dst = {"name": self.name, "mountPath": self.mount_path, "source": self.source, "volumeType": self.volume_type} + dst = { + "name": self.name, + "mountPath": self.mount_path, + "source": self.source, + "volumeType": self.volume_type, + } if self.readonly: dst["readOnly"] = True if self.mount_propagation is not None: @@ -282,7 +292,11 @@ def to_string(self) -> str: return val def to_dict(self) -> dict[str, Any]: - dct = {"name": self.name, "mountPath": self.mount_path, "volumeType": self.volume_type} + dct = { + "name": self.name, + "mountPath": self.mount_path, + "volumeType": self.volume_type, + } if self.storage is not None: dct["storage"] = self.storage return dct @@ -333,7 +347,12 @@ def to_dict(self) -> dict[str, Any]: Convert to dictionary :return: ConfigMapVolume dictionary representation """ - dct = {"name": self.name, "mountPath": self.mount_path, "source": self.source, "volumeType": self.volume_type} + dct = { + "name": self.name, + "mountPath": self.mount_path, + "source": self.source, + "volumeType": self.volume_type, + } if self.items is not None: dct["items"] = self.items return dct @@ -367,7 +386,12 @@ def to_string(self) -> str: return val def to_dict(self) -> dict[str, Any]: - dct = {"name": self.name, "mountPath": self.mount_path, "source": self.source, "volumeType": self.volume_type} + dct = { + "name": self.name, + "mountPath": self.mount_path, + "source": self.source, + "volumeType": self.volume_type, + } if self.items is not None: dct["items"] = self.items return dct @@ -443,7 +467,9 @@ def _get_access_mode() -> AccessMode: case 5: # Empty dir volume return EmptyDirVolume( - name=dst.get("name", ""), mount_path=dst.get("mountPath", ""), 
storage=dst.get("storage") + name=dst.get("name", ""), + mount_path=dst.get("mountPath", ""), + storage=dst.get("storage"), ) case _: raise Exception(f"Unknown volume type in {dst}") diff --git a/kfp/kfp_support_lib/shared_workflow_support/src/runtime_utils/kfp_utils.py b/kfp/kfp_support_lib/shared_workflow_support/src/runtime_utils/kfp_utils.py index 7fa76453f..c531c8e30 100644 --- a/kfp/kfp_support_lib/shared_workflow_support/src/runtime_utils/kfp_utils.py +++ b/kfp/kfp_support_lib/shared_workflow_support/src/runtime_utils/kfp_utils.py @@ -29,7 +29,9 @@ class KFPUtils: @staticmethod def credentials( - access_key: str = "S3_KEY", secret_key: str = "S3_SECRET", endpoint: str = "ENDPOINT" + access_key: str = "S3_KEY", + secret_key: str = "S3_SECRET", + endpoint: str = "ENDPOINT", ) -> tuple[str, str, str]: """ Get credentials from the environment diff --git a/kfp/kfp_support_lib/shared_workflow_support/src/runtime_utils/remote_jobs_utils.py b/kfp/kfp_support_lib/shared_workflow_support/src/runtime_utils/remote_jobs_utils.py index 007345dfd..e63954b61 100644 --- a/kfp/kfp_support_lib/shared_workflow_support/src/runtime_utils/remote_jobs_utils.py +++ b/kfp/kfp_support_lib/shared_workflow_support/src/runtime_utils/remote_jobs_utils.py @@ -61,7 +61,9 @@ def __init__( :param http_retries: http retries """ self.api_server_client = KubeRayAPIs( - server_url=server_url, http_retries=http_retries, wait_interval=wait_interval + server_url=server_url, + http_retries=http_retries, + wait_interval=wait_interval, ) self.default_image = default_image @@ -233,7 +235,13 @@ def create_ray_cluster( # Build cluster spec cluster_spec = ClusterSpec(head_node=head_node_spec, worker_groups=worker_groups) # Build cluster - cluster = Cluster(name=name, namespace=namespace, user="dataprep", version="2.9.3", cluster_spec=cluster_spec) + cluster = Cluster( + name=name, + namespace=namespace, + user="dataprep", + version="2.9.3", + cluster_spec=cluster_spec, + ) status, error = self.api_server_client.create_cluster(cluster) if status != 200: return status, error @@ -352,7 +360,12 @@ def follow_execution( ) if status // 100 != 2: sys.exit(1) - if job_status in {JobStatus.STOPPED, JobStatus.SUCCEEDED, JobStatus.FAILED, JobStatus.RUNNING}: + if job_status in { + JobStatus.STOPPED, + JobStatus.SUCCEEDED, + JobStatus.FAILED, + JobStatus.RUNNING, + }: break time.sleep(self.api_server_client.wait_interval) job_ready_timeout -= self.api_server_client.wait_interval @@ -491,7 +504,10 @@ def execute_ray_jobs( name=name, ns=ns, script=exec_script_name, - data_access_params={f"{cli_prefix}s3_config": config_value, f"{cli_prefix}s3_cred": s3_creds}, + data_access_params={ + f"{cli_prefix}s3_config": config_value, + f"{cli_prefix}s3_cred": s3_creds, + }, params=e_params, additional_params=additional_params, remote_jobs=remote_jobs, @@ -509,7 +525,10 @@ def execute_ray_jobs( name=name, ns=ns, script=exec_script_name, - data_access_params={f"{cli_prefix}s3_config": conf, f"{cli_prefix}s3_cred": s3_creds}, + data_access_params={ + f"{cli_prefix}s3_config": conf, + f"{cli_prefix}s3_cred": s3_creds, + }, params=launch_params, additional_params=additional_params, remote_jobs=remote_jobs, diff --git a/kfp/kfp_support_lib/shared_workflow_support/test/api_params_test.py b/kfp/kfp_support_lib/shared_workflow_support/test/api_params_test.py index 53740c939..6485ae110 100644 --- a/kfp/kfp_support_lib/shared_workflow_support/test/api_params_test.py +++ b/kfp/kfp_support_lib/shared_workflow_support/test/api_params_test.py @@ -56,13 +56,20 @@ def 
test_toleration(): - tol1 = Toleration(key="blah1", operator=TolerationOperation.Exists, effect=TolerationEffect.NoExecute) + tol1 = Toleration( + key="blah1", + operator=TolerationOperation.Exists, + effect=TolerationEffect.NoExecute, + ) print(f"\ntoleration 1: {tol1.to_string()}") t1_json = json.dumps(tol1.to_dict()) print(f"toleration 1 JSON: {t1_json}") tol2 = Toleration( - key="blah2", operator=TolerationOperation.Exists, effect=TolerationEffect.NoExecute, value="value" + key="blah2", + operator=TolerationOperation.Exists, + effect=TolerationEffect.NoExecute, + value="value", ) print(f"toleration 2: {tol2.to_string()}") t2_json = json.dumps(tol2.to_dict()) @@ -74,12 +81,25 @@ def test_toleration(): def test_templates(): - tol1 = Toleration(key="blah1", operator=TolerationOperation.Exists, effect=TolerationEffect.NoExecute) + tol1 = Toleration( + key="blah1", + operator=TolerationOperation.Exists, + effect=TolerationEffect.NoExecute, + ) tol2 = Toleration( - key="blah2", operator=TolerationOperation.Exists, effect=TolerationEffect.NoExecute, value="value" + key="blah2", + operator=TolerationOperation.Exists, + effect=TolerationEffect.NoExecute, + value="value", ) - temp1 = Template(name="template1", namespace="namespace", cpu=1, memory=4, tolerations=[tol1, tol2]) + temp1 = Template( + name="template1", + namespace="namespace", + cpu=1, + memory=4, + tolerations=[tol1, tol2], + ) print(f"\ntemplate 1: {temp1.to_string()}") tm1_json = json.dumps(temp1.to_dict()) print(f"template 1 JSON: {tm1_json}") @@ -121,7 +141,11 @@ def test_volumes(): assert volume_decoder(json.loads(vol_json)).to_string() == vol.to_string() vol = EphemeralVolume( - name="ephemeral", mount_path="tmp/ephemeral", storage="5Gi", storage_class="blah", access_mode=AccessMode.RWX + name="ephemeral", + mount_path="tmp/ephemeral", + storage="5Gi", + storage_class="blah", + access_mode=AccessMode.RWX, ) print(f"Ephemeral volume: {vol.to_string()}") vol_json = json.dumps(vol.to_dict()) @@ -135,7 +159,10 @@ def test_volumes(): assert volume_decoder(json.loads(vol_json)).to_string() == vol.to_string() vol = ConfigMapVolume( - name="confmap", mount_path="tmp/confmap", source="my-map", items={"sample_code.py": "sample_code.py"} + name="confmap", + mount_path="tmp/confmap", + source="my-map", + items={"sample_code.py": "sample_code.py"}, ) print(f"config map volume: {vol.to_string()}") vol_json = json.dumps(vol.to_dict()) @@ -409,7 +436,11 @@ def test_submission(): env_vars: counter_name: test_counter """ - request = RayJobRequest(entrypoint="python /home/ray/samples/sample_code.py", runtime_env=yaml, num_cpu=0.5) + request = RayJobRequest( + entrypoint="python /home/ray/samples/sample_code.py", + runtime_env=yaml, + num_cpu=0.5, + ) print(f"job request: {request.to_string()}") request_json = json.dumps(request.to_dict()) print(f"request JSON: {request_json}") diff --git a/kfp/kfp_support_lib/shared_workflow_support/test/kuberay_api_test.py b/kfp/kfp_support_lib/shared_workflow_support/test/kuberay_api_test.py index a74187697..69c7752cf 100644 --- a/kfp/kfp_support_lib/shared_workflow_support/test/kuberay_api_test.py +++ b/kfp/kfp_support_lib/shared_workflow_support/test/kuberay_api_test.py @@ -43,8 +43,18 @@ def test_templates(): # cleanup _, _ = apis.delete_compute_template(ns="default", name="default-template") # create - toleration = Toleration(key="blah1", operator=TolerationOperation.Exists, effect=TolerationEffect.NoExecute) - template = Template(name="default-template", namespace="default", cpu=2, memory=8, 
tolerations=[toleration]) + toleration = Toleration( + key="blah1", + operator=TolerationOperation.Exists, + effect=TolerationEffect.NoExecute, + ) + template = Template( + name="default-template", + namespace="default", + cpu=2, + memory=8, + tolerations=[toleration], + ) status, error = apis.create_compute_template(template) assert status == 200 assert error is None @@ -250,7 +260,9 @@ def test_job_submission(): counter_name: test_counter """ job_request = RayJobRequest( - entrypoint="python /home/ray/samples/sample_code.py", runtime_env=resource_yaml, num_cpu=0.5 + entrypoint="python /home/ray/samples/sample_code.py", + runtime_env=resource_yaml, + num_cpu=0.5, ) # To ensure that Ray cluster HTTP is ready try to get jobs info from the cluster status, error, job_info_array = apis.list_job_info(ns="default", name="test-job") diff --git a/kfp/kfp_support_lib/shared_workflow_support/test/ray_remote_jobs_test.py b/kfp/kfp_support_lib/shared_workflow_support/test/ray_remote_jobs_test.py index bc6a1f7b9..8170edf12 100644 --- a/kfp/kfp_support_lib/shared_workflow_support/test/ray_remote_jobs_test.py +++ b/kfp/kfp_support_lib/shared_workflow_support/test/ray_remote_jobs_test.py @@ -39,7 +39,11 @@ def test_ray_remote_jobs(): "memory": 4, "image": "rayproject/ray:2.9.3-py310", # Ray start params, just to show - "ray_start_params": {"metrics-export-port": "8080", "num-cpus": "0", "dashboard-host": "0.0.0.0"}, + "ray_start_params": { + "metrics-export-port": "8080", + "num-cpus": "0", + "dashboard-host": "0.0.0.0", + }, "image_pull_policy": "Always", } | dct_volumes @@ -61,7 +65,10 @@ def test_ray_remote_jobs(): # create cluster remote_jobs = RayRemoteJobs(server_url=server_url) status, error = remote_jobs.create_ray_cluster( - name="job-test", namespace="default", head_node=head_node, worker_nodes=[worker_node] + name="job-test", + namespace="default", + head_node=head_node, + worker_nodes=[worker_node], ) print(f"Created cluster - status: {status}, error: {error}") assert status == 200 diff --git a/kfp/pipeline_generator/single-pipeline/templates/simple_pipeline.py b/kfp/pipeline_generator/single-pipeline/templates/simple_pipeline.py index e5d5f7688..cc8c31fb6 100644 --- a/kfp/pipeline_generator/single-pipeline/templates/simple_pipeline.py +++ b/kfp/pipeline_generator/single-pipeline/templates/simple_pipeline.py @@ -14,7 +14,6 @@ import kfp.compiler as compiler import kfp.components as comp import kfp.dsl as dsl - from workflow_support.compile_utils import ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils diff --git a/kfp/pipeline_generator/superpipeline/generated/sample-super-kubeflow-pipeline_wf.py b/kfp/pipeline_generator/superpipeline/generated/sample-super-kubeflow-pipeline_wf.py index 92f97763d..8561edfcf 100644 --- a/kfp/pipeline_generator/superpipeline/generated/sample-super-kubeflow-pipeline_wf.py +++ b/kfp/pipeline_generator/superpipeline/generated/sample-super-kubeflow-pipeline_wf.py @@ -14,6 +14,7 @@ doc_id_image = "quay.io/dataprep1/data-prep-kit/doc_id-ray:latest`" ededup_image = "quay.io/dataprep1/data-prep-kit/ededup-ray:latest" + # Pipeline to invoke execution on remote resource @dsl.pipeline( name="sample-super-kubeflow-pipeline", @@ -79,11 +80,19 @@ def _set_component(op: dsl.BaseOp, displaied_name: str, prev_op: dsl.BaseOp = No op.after(prev_op) doc_id = run_doc_id_op( - name=p1_orch_doc_id_name, prefix="p3_", params=args, host=orch_host, input_folder=p2_pipeline_input_parent_path + name=p1_orch_doc_id_name, + prefix="p3_", + params=args, + host=orch_host, + 
input_folder=p2_pipeline_input_parent_path, ) _set_component(doc_id, "doc_id") ededup = run_ededup_op( - name=p1_orch_ededup_name, prefix="p4_", params=args, host=orch_host, input_folder=doc_id.output + name=p1_orch_ededup_name, + prefix="p4_", + params=args, + host=orch_host, + input_folder=doc_id.output, ) _set_component(ededup, "ededup", doc_id) diff --git a/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py b/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py index f3f491e4b..a978864da 100644 --- a/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py +++ b/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py @@ -30,6 +30,7 @@ # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" + # compute execution parameters. Here different transforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. def compute_exec_params_func( @@ -109,7 +110,14 @@ def code2parquet( ray_name: str = "code2parquet-kfp-ray", # name of Ray cluster # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, - ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, + ray_worker_options: dict = { + "replicas": 2, + "max_replicas": 2, + "min_replicas": 2, + "cpu": 2, + "memory": 4, + "image": task_image, + }, server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", # data access data_s3_config: str = "{'input_folder': 'test/code2parquet/input', 'output_folder': 'test/code2parquet/output/'}", @@ -118,9 +126,13 @@ def code2parquet( data_num_samples: int = -1, data_files_to_use: str = "['.zip']", # orchestrator - runtime_actor_options: dict = {'num_cpus': 0.8}, + runtime_actor_options: dict = {"num_cpus": 0.8}, runtime_pipeline_id: str = "pipeline_id", - runtime_code_location: dict = {'github': 'github', 'commit_hash': '12345', 'path': 'path'}, + runtime_code_location: dict = { + "github": "github", + "commit_hash": "12345", + "path": "path", + }, # code to parquet code2parquet_supported_langs_file: str = "test/code2parquet/languages/lang_extensions.json", code2parquet_detect_programming_lang: bool = True, @@ -173,7 +185,12 @@ def code2parquet( :return: None """ # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params) + clean_up_task = cleanup_ray_op( + ray_name=ray_name, + run_id=run_id, + server_url=server_url, + additional_params=additional_params, + ) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) # pipeline definition with dsl.ExitHandler(clean_up_task): diff --git a/transforms/code/code2parquet/python/src/code2parquet_transform.py b/transforms/code/code2parquet/python/src/code2parquet_transform.py index b00f9ff05..db77b58cc 100644 --- a/transforms/code/code2parquet/python/src/code2parquet_transform.py +++ b/transforms/code/code2parquet/python/src/code2parquet_transform.py @@ -186,7 +186,10 @@ def add_input_params(self, parser: ArgumentParser) -> None: help="Infer the programming lang from the file extension using the file of supported languages", ) parser.add_argument( - f"--{snapshot_cli_key}", type=str, help="Snapshot value assigned to all imported documents.", default=None + f"--{snapshot_cli_key}", + type=str, + help="Snapshot value assigned to all imported documents.", + default=None, ) 
parser.add_argument( f"--{domain_cli_key}", diff --git a/transforms/code/code2parquet/python/test/test_code2parquet.py b/transforms/code/code2parquet/python/test/test_code2parquet.py index 22524264b..caa4d4eb4 100644 --- a/transforms/code/code2parquet/python/test/test_code2parquet.py +++ b/transforms/code/code2parquet/python/test/test_code2parquet.py @@ -38,7 +38,12 @@ def get_test_transform_fixtures(self) -> list[tuple]: input_dir = os.path.join(basedir, "input") input_files = get_files_in_folder(input_dir, ".zip") input_files = [(name, binary) for name, binary in input_files.items()] - expected_metadata_list = [{"number of rows": 2}, {"number of rows": 20}, {"number of rows": 52}, {}] + expected_metadata_list = [ + {"number of rows": 2}, + {"number of rows": 20}, + {"number of rows": 52}, + {}, + ] config = { supported_langs_file_key: lang_supported_file, detect_programming_lang_key: True, @@ -50,7 +55,14 @@ def get_test_transform_fixtures(self) -> list[tuple]: (binary, TransformUtils.get_file_extension(name)[1]) for name, binary in expected_files.items() ] - return [(CodeToParquetTransform(config), input_files, expected_files, expected_metadata_list)] + return [ + ( + CodeToParquetTransform(config), + input_files, + expected_files, + expected_metadata_list, + ) + ] if __name__ == "__main__": diff --git a/transforms/code/code_profiler/python/.dockerignore b/transforms/code/code_profiler/.dockerignore similarity index 100% rename from transforms/code/code_profiler/python/.dockerignore rename to transforms/code/code_profiler/.dockerignore diff --git a/transforms/code/code_profiler/.make.subdirs b/transforms/code/code_profiler/.make.subdirs new file mode 100644 index 000000000..28294b5c0 --- /dev/null +++ b/transforms/code/code_profiler/.make.subdirs @@ -0,0 +1,2 @@ +python +ray \ No newline at end of file diff --git a/transforms/code/code_profiler/python/Dockerfile b/transforms/code/code_profiler/Dockerfile.python similarity index 54% rename from transforms/code/code_profiler/python/Dockerfile rename to transforms/code/code_profiler/Dockerfile.python index 297a24e4a..60fb998cd 100644 --- a/transforms/code/code_profiler/python/Dockerfile +++ b/transforms/code/code_profiler/Dockerfile.python @@ -9,32 +9,19 @@ RUN pip install --no-cache-dir pytest RUN useradd -ms /bin/bash dpk USER dpk WORKDIR /home/dpk + ARG DPK_WHEEL_FILE_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). -COPY --chown=dpk:root data-processing-dist data-processing-dist +COPY --chown=dpk:root data-processing-dist/ data-processing-dist/ RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME} -COPY --chown=dpk:root src/ src/ -COPY --chown=dpk:root pyproject.toml pyproject.toml -COPY --chown=dpk:root README.md README.md -COPY --chown=dpk:root requirements.txt requirements.txt - -RUN pip install --no-cache-dir -e . - -# copy the main() entry point to the image -COPY ./src/code_profiler_transform_python.py . 
+# END OF STEPS destined for a data-prep-kit base image

-# copy some of the samples in
-COPY ./src/code_profiler_local.py local/
-
-# Copy the tree-sitter bindings (this is the important part)
-COPY --chown=dpk:root ../../input/tree-sitter-bindings/ /home/dpk/input/tree-sitter-bindings/
-
-# copy test
-# COPY test/ test/
-# COPY test-data/ test-data/
+COPY --chown=dpk:root dpk_code_profiler/ dpk_code_profiler/
+COPY --chown=dpk:root requirements.txt requirements.txt
+RUN pip install -r requirements.txt

 # Set environment
 ENV PYTHONPATH /home/dpk
diff --git a/transforms/code/code_profiler/ray/Dockerfile b/transforms/code/code_profiler/Dockerfile.ray
similarity index 57%
rename from transforms/code/code_profiler/ray/Dockerfile
rename to transforms/code/code_profiler/Dockerfile.ray
index a5439f005..6b2e4695e 100644
--- a/transforms/code/code_profiler/ray/Dockerfile
+++ b/transforms/code/code_profiler/Dockerfile.ray
@@ -1,11 +1,11 @@
 ARG BASE_IMAGE=docker.io/rayproject/ray:2.24.0-py310
+
 FROM ${BASE_IMAGE}

 RUN pip install --upgrade --no-cache-dir pip

 # install pytest
 RUN pip install --no-cache-dir pytest
-
 ARG DPK_WHEEL_FILE_NAME

 # Copy and install data processing libraries
@@ -13,25 +13,10 @@ ARG DPK_WHEEL_FILE_NAME
 COPY --chown=ray:users data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]

-COPY --chown=ray:users python-transform/ python-transform/
-RUN cd python-transform && pip install --no-cache-dir -e .
-
-#COPY requirements.txt requirements.txt
-#RUN pip install --no-cache-dir -r requirements.txt
-
-COPY --chown=ray:users src/ src/
-COPY --chown=ray:users pyproject.toml pyproject.toml
-RUN pip install --no-cache-dir -e .
-
-# copy the main() entry point to the image
-COPY ./src/code_profiler_transform_ray.py .
-
-# copy some of the samples in
-COPY ./src/code_profiler_local_ray.py local/
-
-# copy test
-COPY test/ test/
-COPY test-data/ test-data/
+## Copy the python version of the transform
+COPY --chown=ray:users dpk_code_profiler/ dpk_code_profiler/
+COPY --chown=ray:users requirements.txt requirements.txt
+RUN pip install -r requirements.txt

 # Grant non-root users the necessary permissions to the ray directory
 RUN chmod 755 /home/ray
diff --git a/transforms/code/code_profiler/Makefile b/transforms/code/code_profiler/Makefile
index bace5e172..26bb94824 100644
--- a/transforms/code/code_profiler/Makefile
+++ b/transforms/code/code_profiler/Makefile
@@ -1,49 +1,32 @@
 REPOROOT=../../..
 # Use make help, to see the available rules
-include $(REPOROOT)/.make.defaults
-
-setup::
-	@# Help: Recursively make $@ all subdirs
-	$(MAKE) RULE=$@ .recurse
-
-clean::
-	@# Help: Recursively make $@ all subdirs
-	$(MAKE) RULE=$@ .recurse
-
-build::
-	@# Help: Recursively make $@ in subdirs
-	$(MAKE) RULE=$@ .recurse
-venv::
-	@# Help: Recursively make $@ in subdirs
-	$(MAKE) RULE=$@ .recurse
-
-publish::
-	@# Help: Recursively make $@ in all subdirs
-	@$(MAKE) RULE=$@ .recurse
-
-test-image:
-	@echo "Skipping test-image step as per configuration."
- -test:: - @# Help: Recursively make $@ in all subdirs - @$(MAKE) RULE=$@ .recurse - -test-src:: - @# Help: Recursively make $@ in all subdirs - $(MAKE) RULE=$@ .recurse - -set-versions: - @# Help: Recursively $@ in all subdirs - $(MAKE) RULE=$@ .recurse - -.PHONY: workflow-venv -workflow-venv: - -.PHONY: workflow-test -workflow-test: - -.PHONY: workflow-upload -workflow-upload: - -.PHONY: workflow-build -workflow-build: +include $(REPOROOT)/transforms/.make.cicd.targets + +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. +TRANSFORM_NAME=$(shell basename `pwd`) + +################################################################################ +export RUNTIME_HOST_ARCH=x86_64 + +run-cli-sample: + make venv + source venv/bin/activate && \ + $(PYTHON) -m dpk_$(TRANSFORM_NAME).transform_python \ + --data_local_config "{ 'input_folder' : 'test-data/input', 'output_folder' : 'output'}" \ + --language "language" \ + --contents "contents" + +run-ray-cli-sample: + make venv + source venv/bin/activate && \ + $(PYTHON) -m dpk_$(TRANSFORM_NAME).ray.transform \ + --data_local_config "{ 'input_folder' : 'test-data/input', 'output_folder' : 'output'}" \ + --run_locally True \ + --language "language" \ + --contents "contents" \ No newline at end of file diff --git a/transforms/code/code_profiler/README.md b/transforms/code/code_profiler/README.md index 84153493e..564f04e10 100644 --- a/transforms/code/code_profiler/README.md +++ b/transforms/code/code_profiler/README.md @@ -39,13 +39,6 @@ As shown in Table 2, the framework standardizes code representation by categoriz | | **Ocaml** | Yes | NA | Yes | -* [python](python/README.md) - provides the base python-based syntactic concept extractor -implementation. -* [ray](ray/README.md) - provides the base ray-based syntactic concept extractor -implementation. - - - **Offline Path for Syntactic Rule Generation** The offline path is critical for expanding and refining the syntactic rule database, enabling the UBSR framework to adapt to new languages and syntactic constructs. This process leverages LLMs to generate syntactic rules for languages that are not yet included in the rule database. To achieve this, we utilize a Few-shot Chain of Thought prompting technique, guiding the LLM through a step-by-step rule generation process. By providing carefully curated training exemplars and detailed instructions, this method ensures the LLM can accurately generalize from these examples to produce effective syntactic rules for a wide range of languages. This structured approach enhances the flexibility of the UBSR framework, allowing it to seamlessly handle evolving language constructs. @@ -62,12 +55,104 @@ For each new target language, the offline phase is utilized to create determinis In the online phase, the system dynamically generates profiling outputs for any incoming code snippets. This is achieved by extracting concepts from the snippets using the rules in the database and storing these extractions in a tabular format. The structured tabular format allows for generating additional concept columns, which are then utilized to create comprehensive profiling reports. 
-The following runtimes are available:
-* [python](python/README.md) - provides the base python-based transformation
-implementation and python runtime.
-* [ray](ray/README.md) - enables the running of the base python transformation
-in a Ray runtime
-Please refer to the playbook at `transforms/code/code_profiler/notebook_example/code-profiler.ipynb` to run the pythonic code profiler
+## Configuration and command line Options
+
+The set of dictionary keys holding the [code_profiler_transform](dpk_code_profiler/transform.py)
+configuration values is as follows:
+
+* content - specifies the column name in the dataframe that holds the code snippet
+* language - specifies the programming language of the code snippet
+
+## Running
+
+### Running the samples
+
+The code profiler can be run on mach-arm64 and x86_64 host architectures.
+Depending on your host architecture, please change the `RUNTIME_HOST_ARCH` in the Makefile.
+```
+# possible values: mach-arm64, x86_64
+export RUNTIME_HOST_ARCH=x86_64
+```
+If you are using a Mac, you may need to permit it to load the .so files from the security settings. Generally, the pop-up appears under the Security tab while running the transform.
+
+![macOS security settings](image.png)
+
+To run the samples, use the following `make` target:
+
+* `run-cli-sample` - runs dpk_code_profiler/transform.py using command line args
+
+This target will activate the virtual environment and set up any configuration needed.
+Use the `-n` option of `make` to see the detail of what is done to run the sample.
+
+For example,
+```shell
+make run-cli-sample
+...
+```
+Then
+```shell
+ls output
+```
+to see the results of the transform.
+
+
+### Transforming data using the transform image
+
+To use the transform image to transform your data, please refer to the
+[running images quickstart](../../../doc/quick-start/run-transform-image.md),
+substituting the name of this transform image and runtime as appropriate.
+
+## Testing
+
+Following [the testing strategy of data-processing-lib](../../../data-processing-lib/doc/transform-testing.md),
+we currently have:
+- [Unit test](test/test_code_profiler_python.py)
+- [Integration test](test/test_code_profiler.py)
+
+
+## Code Profiler Ray Transform
+Please see the set of
+[transform project conventions](../../README.md#transform-project-conventions)
+for details on general project conventions, transform configuration,
+testing and IDE set up.
+
+
+### Configuration and command line Options
+
+Code profiler configuration and command line options are the same as for the base python transform.
+
+### Running
+
+#### Launched Command Line Options
+When running the transform with the Ray launcher (i.e., TransformLauncher),
+the set of
+[ray launcher options](../../../data-processing-lib/doc/ray-launcher-options.md)
+is available in addition to those available to the transform as defined here.
+
+#### Running the samples
+To run the samples, use the following `make` target:
+
+* `run-ray-cli-sample` - runs dpk_code_profiler/ray/transform.py using command line args
+
+This target will activate the virtual environment and set up any configuration needed.
+Use the `-n` option of `make` to see the detail of what is done to run the sample.
+
+For example,
+```shell
+make run-ray-cli-sample
+...
+```
+Then
+```shell
+ls output
+```
+to see the results of the transform.
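+
+As a reference, the `run-ray-cli-sample` target is roughly equivalent to the following
+direct invocation (a minimal sketch; the folder values are illustrative and mirror the
+defaults used by the Makefile target):
+
+```shell
+# illustrative paths; mirrors the run-ray-cli-sample Makefile target
+source venv/bin/activate
+python -m dpk_code_profiler.ray.transform \
+    --data_local_config "{ 'input_folder' : 'test-data/input', 'output_folder' : 'output'}" \
+    --run_locally True \
+    --language "language" \
+    --contents "contents"
+```
+
+The `--run_locally True` flag starts a local Ray cluster for the run.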
+ +#### Transforming data using the transform image +To use the transform image to transform your data, please refer to the +[running images quickstart](../../../doc/quick-start/run-transform-image.md), +substituting the name of this transform image and runtime as appropriate. \ No newline at end of file diff --git a/transforms/code/code_profiler/python/src/UAST.py b/transforms/code/code_profiler/dpk_code_profiler/UAST.py similarity index 68% rename from transforms/code/code_profiler/python/src/UAST.py rename to transforms/code/code_profiler/dpk_code_profiler/UAST.py index 6406a7aee..c59d0d243 100644 --- a/transforms/code/code_profiler/python/src/UAST.py +++ b/transforms/code/code_profiler/dpk_code_profiler/UAST.py @@ -11,8 +11,10 @@ ################################################################################ import json -import networkx + import matplotlib.pyplot as plt +import networkx + class UASTNode: """ @@ -29,16 +31,18 @@ class UASTNode: end_point (tuple(int, int)): The end line number and byte of the node. """ - def __init__(self, - id: int = 0, - code_snippet: str = None, - node_type: str = None, - parents: list = list(), - children: list = list(), - metadata : dict = dict(), - start_point : tuple[int,int] = (None, None), - end_point : tuple[int,int] = (None, None)) -> None: - + def __init__( + self, + id: int = 0, + code_snippet: str = None, + node_type: str = None, + parents: list = list(), + children: list = list(), + metadata: dict = dict(), + start_point: tuple[int, int] = (None, None), + end_point: tuple[int, int] = (None, None), + ) -> None: + self.id = id self.code_snippet = code_snippet self.node_type = node_type @@ -50,13 +54,23 @@ def __init__(self, def __str__(self) -> str: return f"ID: {self.id}, Type: {self.node_type}, Snippet: {repr(self.code_snippet)}, Parents: {self.parents}, Children: {self.children}, Metadata = {self.metadata}" - + def __repr__(self) -> str: return f"ID: {self.id}, Type: {self.node_type}, Snippet: {repr(self.code_snippet)}, Parents: {self.parents}, Children: {self.children}, Metadata = {self.metadata}" - + def __eq__(self, other) -> bool: - return self.id == other.id and self.code_snippet == other.code_snippet and self.node_type == other.node_type and self.parents == other.parents and self.children == other.children and self.metadata == other.metadata and self.start_point == other.start_point and self.end_point == other.end_point - + return ( + self.id == other.id + and self.code_snippet == other.code_snippet + and self.node_type == other.node_type + and self.parents == other.parents + and self.children == other.children + and self.metadata == other.metadata + and self.start_point == other.start_point + and self.end_point == other.end_point + ) + + class UASTEdge: """ Represents an edge in the UAST (Universal Abstract Syntax Tree). @@ -68,11 +82,13 @@ class UASTEdge: metadata (dict): The metadata information associated with the edge. 
""" - def __init__(self, - start_id: int = None, - end_id: int = None, - directed_relation: str = None, - metadata : dict = dict()): + def __init__( + self, + start_id: int = None, + end_id: int = None, + directed_relation: str = None, + metadata: dict = dict(), + ): self.start_id = start_id self.end_id = end_id @@ -81,16 +97,22 @@ def __init__(self, def __str__(self) -> str: return f"Start: {self.start_id}, End: {self.end_id}, Relation: {self.directed_relation}, Metadata = {self.metadata}, Metadata: {self.metadata}" - + def __repr__(self) -> str: return f"Start: {self.start_id}, End: {self.end_id}, Relation: {self.directed_relation}, Metadata = {self.metadata}, Metadata: {self.metadata}" - + def __eq__(self, other) -> bool: - return self.start_id == other.start_id and self.end_id == other.end_id and self.directed_relation == other.directed_relation and self.metadata == other.metadata - + return ( + self.start_id == other.start_id + and self.end_id == other.end_id + and self.directed_relation == other.directed_relation + and self.metadata == other.metadata + ) + def __hash__(self) -> int: return hash((self.start_id, self.end_id, self.directed_relation, self.metadata)) - + + class UAST: """ Represents a graph of a Universal Abstract Syntax Tree (UAST). @@ -119,11 +141,12 @@ class UAST: load_from_file(file_path): Loads the UAST from a file in JSON format. visualize(): Visualizes the graph using NetworkX """ + def __init__(self): - self.nodes : dict[int,UASTNode] = dict() - self.edges : list[UASTEdge] = list() - self.assigned_id : int = 0 - self.nodes_of_type : dict = dict() + self.nodes: dict[int, UASTNode] = dict() + self.edges: list[UASTEdge] = list() + self.assigned_id: int = 0 + self.nodes_of_type: dict = dict() self.root = self._create_root() def __len__(self) -> int: @@ -131,69 +154,96 @@ def __len__(self) -> int: def __str__(self) -> str: return f"Nodes: {self.nodes} \nEdges: {self.edges}" - + def __repr__(self) -> str: return f"Nodes: {self.nodes} \nEdges: {self.edges}" - + def __eq__(self, other) -> bool: return self.nodes == other.nodes and self.edges == other.edges - def add_node(self, node : UASTNode) -> None: + def add_node(self, node: UASTNode) -> None: self.nodes[self.assigned_id] = node self.assigned_id += 1 - if node.node_type not in self.nodes_of_type : + if node.node_type not in self.nodes_of_type: self.nodes_of_type[node.node_type] = list() self.nodes_of_type[node.node_type].append(node.id) return def _create_root(self) -> UASTNode: - return self.create_node(node_type = "uast_root", code_snippet = "root", metadata= {"info" : "links to all"}, start_point = (-1,0), end_point = (-1,3)) - - def create_node(self, - node_type : str = None, - code_snippet : str = None, - metadata : dict = dict(), - start_point : tuple[int,int] = (None, None), - end_point : tuple[int,int] = (None, None)) -> UASTNode: - - node = UASTNode(id = self.assigned_id, node_type = node_type, code_snippet = code_snippet, metadata = metadata, start_point = start_point, end_point = end_point, children= list(), parents = list()) + return self.create_node( + node_type="uast_root", + code_snippet="root", + metadata={"info": "links to all"}, + start_point=(-1, 0), + end_point=(-1, 3), + ) + + def create_node( + self, + node_type: str = None, + code_snippet: str = None, + metadata: dict = dict(), + start_point: tuple[int, int] = (None, None), + end_point: tuple[int, int] = (None, None), + ) -> UASTNode: + + node = UASTNode( + id=self.assigned_id, + node_type=node_type, + code_snippet=code_snippet, + 
metadata=metadata, + start_point=start_point, + end_point=end_point, + children=list(), + parents=list(), + ) self.add_node(node) return node - - def add_edge(self, node1 : UASTNode = None, node2 : UASTNode = None, directed_relation : str = None, metadata : dict = dict())-> UASTEdge: - edge = UASTEdge(start_id = node1.id, end_id = node2.id, directed_relation = directed_relation, metadata = metadata) + + def add_edge( + self, + node1: UASTNode = None, + node2: UASTNode = None, + directed_relation: str = None, + metadata: dict = dict(), + ) -> UASTEdge: + edge = UASTEdge( + start_id=node1.id, + end_id=node2.id, + directed_relation=directed_relation, + metadata=metadata, + ) node2.parents.append(node1.id) node1.children.append(node2.id) self.edges.append(edge) return edge - - def get_node(self, id : int) -> UASTNode: + + def get_node(self, id: int) -> UASTNode: return self.nodes[id] - - def get_nodes_of_type(self, node_type : str) -> list[int]: + + def get_nodes_of_type(self, node_type: str) -> list[int]: return self.nodes_of_type[node_type] - - def get_children(self, node : UASTNode) -> list[int]: + + def get_children(self, node: UASTNode) -> list[int]: return node.children - - def get_parents(self, node : UASTNode) -> int: + + def get_parents(self, node: UASTNode) -> int: return node.parents def print_graph(self, id): if id not in self.nodes: return visited = set() - + def dfs(id, visited): visited.add(id) print(self.nodes[id]) for child in self.nodes[id].children: if child not in visited: dfs(child, visited) - + dfs(id, visited) del visited - def save_to_file(self, file_path): # convert children list to list for serialization @@ -203,17 +253,16 @@ def save_to_file(self, file_path): v.parents = list(v.parents) copy_nodes[k] = v - data = { "nodes": {str(k): v.__dict__ for k, v in self.nodes.items()}, - "edges": [edge.__dict__ for edge in self.edges] + "edges": [edge.__dict__ for edge in self.edges], } - with open(file_path, 'w') as f: - json.dump(data, f, indent= 4) + with open(file_path, "w") as f: + json.dump(data, f, indent=4) return - + def get_json(self): copy_nodes = self.nodes.copy() @@ -224,11 +273,11 @@ def get_json(self): data = { "nodes": {str(k): v.__dict__ for k, v in self.nodes.items()}, - "edges": [edge.__dict__ for edge in self.edges] + "edges": [edge.__dict__ for edge in self.edges], } - + return data - + def load_from_json_string(self, obj: str): data = json.loads(obj) self.nodes = {int(k): UASTNode(**v) for k, v in data["nodes"].items()} @@ -237,10 +286,10 @@ def load_from_json_string(self, obj: str): for node in self.nodes.values(): node.start_point = tuple(node.start_point) node.end_point = tuple(node.end_point) - return + return def load_from_file(self, file_path): - with open(file_path, 'r') as f: + with open(file_path, "r") as f: data = json.load(f) self.nodes = {int(k): UASTNode(**v) for k, v in data["nodes"].items()} self.edges = [UASTEdge(**edge) for edge in data["edges"]] @@ -248,7 +297,7 @@ def load_from_file(self, file_path): for node in self.nodes.values(): node.start_point = tuple(node.start_point) node.end_point = tuple(node.end_point) - return + return def visualize(self): edges_viz = [] @@ -258,13 +307,18 @@ def visualize(self): labeldict[edge.start_id] = self.nodes[edge.start_id].node_type labeldict[edge.end_id] = self.nodes[edge.end_id].node_type print(labeldict) - plt.figure(figsize=(10,10)) + plt.figure(figsize=(10, 10)) plt.rcParams["font.size"] = 20 - G = networkx.Graph() - G.add_edges_from(edges_viz) + G = networkx.Graph() + 
G.add_edges_from(edges_viz) pos = networkx.spring_layout(G) - networkx.draw_networkx_labels(G, pos, labels= labeldict, font_size= 12, ) - networkx.draw_networkx_nodes(G, pos, nodelist= self.nodes.keys(), node_size= 300) - networkx.draw_networkx_edges(G, pos, edgelist= edges_viz) - plt.show() + networkx.draw_networkx_labels( + G, + pos, + labels=labeldict, + font_size=12, + ) + networkx.draw_networkx_nodes(G, pos, nodelist=self.nodes.keys(), node_size=300) + networkx.draw_networkx_edges(G, pos, edgelist=edges_viz) + plt.show() return diff --git a/transforms/code/code_profiler/python/src/UAST_parser.py b/transforms/code/code_profiler/dpk_code_profiler/UAST_parser.py similarity index 73% rename from transforms/code/code_profiler/python/src/UAST_parser.py rename to transforms/code/code_profiler/dpk_code_profiler/UAST_parser.py index ec0441df7..ce62a705c 100644 --- a/transforms/code/code_profiler/python/src/UAST_parser.py +++ b/transforms/code/code_profiler/dpk_code_profiler/UAST_parser.py @@ -10,26 +10,31 @@ # limitations under the License. ################################################################################ -from UAST import UAST import json -from tree_sitter import Tree import os import sys + +from tree_sitter import Tree +from UAST import UAST + + sys.setrecursionlimit(10000) """ Initialize the parser with a path for rules and grammar. """ -class UASTParser(): + + +class UASTParser: def __init__(self): - self.language : str = None - self.uast : UAST = None - self.rules : dict = None + self.language: str = None + self.uast: UAST = None + self.rules: dict = None self.cached_rules = dict() # Compute the absolute path to the tree-sitter-bindings directory grammar_dir = os.path.dirname(os.path.abspath(__file__)) - self.grammar_path = os.path.join(grammar_dir, '..', '..', 'python', 'src', 'grammar', 'UAST_Grammar.json') + self.grammar_path = os.path.join(grammar_dir, "..", "..", "python", "src", "grammar", "UAST_Grammar.json") if not os.path.exists(self.grammar_path): print("Current working directory:", os.getcwd()) @@ -40,65 +45,71 @@ def __init__(self): # Compute the absolute path to the ruleset directory based on the script's location script_dir = os.path.dirname(os.path.abspath(__file__)) - self.rule_directory = os.path.join(script_dir, 'ruleset/') - + self.rule_directory = os.path.join(script_dir, "ruleset/") + if not os.path.isdir(self.rule_directory): print("Script directory:", script_dir) raise FileNotFoundError(f"Ruleset directory not found at {self.rule_directory}. Please ensure it exists.") - ''' + """ # Rule directory and file self.rule_directory = "../../python/src/ruleset/" if not os.path.isdir(self.rule_directory): print("Current working directory:", os.getcwd()) raise FileNotFoundError(f"Ruleset directory not found at {self.rule_directory}. 
Please ensure it exists.") - ''' + """ self.rule_file_name: str = "UAST_rules_" - self.AST : Tree = None + self.AST: Tree = None # self.offset : int = None # self.prev_line : int = -1 - self.extracted : str = None + self.extracted: str = None self.function_info = dict() self.class_info = dict() - self.user_defined_entity = {"uast_function": "self.function_info[snippet] = id", - "uast_class": "self.class_info[snippet] = id"} - + self.user_defined_entity = { + "uast_function": "self.function_info[snippet] = id", + "uast_class": "self.class_info[snippet] = id", + } def set_rule_dir_path(self, path: str): self.rule_directory = path - def set_grammar_path(self, path : str): + def set_grammar_path(self, path: str): self.grammar_path = path self.grammar = json.load(open(self.grammar_path, "r")) # set language for the parser - def set_language(self, language : str): + def set_language(self, language: str): self.language = language - if (language not in self.cached_rules): - rules_cache = json.load(open(self.rule_directory + self.rule_file_name + self.language + '.json', "r")) + if language not in self.cached_rules: + rules_cache = json.load( + open( + self.rule_directory + self.rule_file_name + self.language + ".json", + "r", + ) + ) self.cached_rules[language] = rules_cache - + self.rules = self.cached_rules[language] # initialise a DFS traversal on the AST and an empty UAST. - def parse(self, AST, code_snippet) : - if(self.language == None) : + def parse(self, AST, code_snippet): + if self.language == None: print("Language not loaded") return self.AST = AST self.uast = UAST() self.uast.root.metadata["language"] = self.language self.uast.root.metadata["loc_snippet"] = self.count_loc(code_snippet, self.language) - self._dfs(AST_node = self.AST.root_node, parent = self.uast.root) - ''' + self._dfs(AST_node=self.AST.root_node, parent=self.uast.root) + """ # commenting this block temporarily # Call the new modularized function to calculate the code-to-comment ratio code_to_comment_ratio = self.calculate_code_to_comment_ratio(self.uast.root) # Add the code_to_comment_ratio to the root node's metadata self.uast.root.metadata["code_to_comment_ratio"] = code_to_comment_ratio - ''' + """ return self.uast def calculate_code_to_comment_ratio(self, root_node): @@ -129,9 +140,9 @@ def sum_comment_loc(node): return loc_snippet / total_comment_loc else: return None # Handle no comments - + def count_lo_comments(self, code_snippet): - lines = code_snippet.split('\n') + lines = code_snippet.split("\n") loc_count = 0 for line in lines: stripped_line = line.strip() @@ -143,47 +154,49 @@ def count_lo_comments(self, code_snippet): def count_loc(self, code_snippet, language): # Define the comment markers for each language language_comment_markers = { - "c": ('//', '/*', '*/'), - "java": ('//', '/*', '*/'), - "C#": ('//', '/*', '*/'), - "c_sharp": ('//', '/*', '*/'), - "cpp": ('//', '/*', '*/'), - "objc": ('//', '/*', '*/'), - "rust": ('//', '/*', '*/'), - "go": ('//', '/*', '*/'), - "kotlin": ('//', '/*', '*/'), - "VHDL": ('--', None, None), - "py": ('#', '"""', '"""'), - "js": ('//', '/*', '*/'), - "dart": ('//', '/*', '*/'), - "QML": ('//', None, None), - "typescript": ('//', '/*', '*/'), - "perl": ('#', None, None), - "haskell": ('--', '{-', '-}'), - "elm": ('--', '{-', '-}'), - "agda": ('--', '{-', '-}'), - "d": ('//', '/*', '*/'), - "nim": ('#', '##', None), - "ocaml": ('(*', '(*', '*)'), - "scala": ('//', '/*', '*/') + "c": ("//", "/*", "*/"), + "java": ("//", "/*", "*/"), + "C#": ("//", "/*", "*/"), + 
"c_sharp": ("//", "/*", "*/"), + "cpp": ("//", "/*", "*/"), + "objc": ("//", "/*", "*/"), + "rust": ("//", "/*", "*/"), + "go": ("//", "/*", "*/"), + "kotlin": ("//", "/*", "*/"), + "VHDL": ("--", None, None), + "py": ("#", '"""', '"""'), + "js": ("//", "/*", "*/"), + "dart": ("//", "/*", "*/"), + "QML": ("//", None, None), + "typescript": ("//", "/*", "*/"), + "perl": ("#", None, None), + "haskell": ("--", "{-", "-}"), + "elm": ("--", "{-", "-}"), + "agda": ("--", "{-", "-}"), + "d": ("//", "/*", "*/"), + "nim": ("#", "##", None), + "ocaml": ("(*", "(*", "*)"), + "scala": ("//", "/*", "*/"), } - - single_line_comment, multi_line_comment_start, multi_line_comment_end = language_comment_markers.get(language, (None, None, None)) - + + single_line_comment, multi_line_comment_start, multi_line_comment_end = language_comment_markers.get( + language, (None, None, None) + ) + if not single_line_comment: raise ValueError(f"Unsupported language: {language}") - - lines = code_snippet.split('\n') + + lines = code_snippet.split("\n") loc_count = 0 inside_multiline_comment = False for line in lines: stripped_line = line.strip() - + # Skip empty lines if not stripped_line: continue - + # Handle multi-line comments if multi_line_comment_start and multi_line_comment_end: if inside_multiline_comment: @@ -195,11 +208,11 @@ def count_loc(self, code_snippet, language): # If the line starts a multi-line comment inside_multiline_comment = True continue - + # Skip single-line comments if stripped_line.startswith(single_line_comment): continue - + # If the line is neither a comment nor blank, count it as LOC loc_count += 1 @@ -208,9 +221,9 @@ def count_loc(self, code_snippet, language): def _add_user_defined(self, node): id = node.id type = node.node_type - + if node.code_snippet is not None: - snippet = node.code_snippet.replace(type, '').strip() + snippet = node.code_snippet.replace(type, "").strip() # Add further processing with the snippet else: # Handle the case where code_snippet is None @@ -218,7 +231,7 @@ def _add_user_defined(self, node): # You can log a warning or take other appropriate action print(f"Warning: node.code_snippet is None for node type: {type}") - if (type in self.user_defined_entity): + if type in self.user_defined_entity: exec(self.user_defined_entity[type]) node.metadata["user_defined"] = True @@ -230,11 +243,11 @@ def _add_user_defined(self, node): # Traversing through the AST to create nodes recursively. def _dfs(self, AST_node, parent): - if (AST_node.type in self.rules): + if AST_node.type in self.rules: ast_snippet = AST_node.text.decode("utf8") node_type = self.rules[AST_node.type]["uast_node_type"] exec_string = self.rules[AST_node.type]["extractor"] - uast_snippet = self._extract(ast_snippet = ast_snippet, node_type = node_type, exec_string = exec_string) + uast_snippet = self._extract(ast_snippet=ast_snippet, node_type=node_type, exec_string=exec_string) if node_type == "uast_comment": loc_original_code = self.count_lo_comments(ast_snippet) @@ -242,21 +255,21 @@ def _dfs(self, AST_node, parent): loc_original_code = self.count_loc(ast_snippet, self.language) node = self.uast.create_node( - node_type = node_type, - code_snippet = uast_snippet, - # choose to enable or disbale the storage of original code by removing the following line. - metadata = { - "original_code" : ast_snippet, - "loc_original_code": loc_original_code + node_type=node_type, + code_snippet=uast_snippet, + # choose to enable or disbale the storage of original code by removing the following line. 
+ metadata={ + "original_code": ast_snippet, + "loc_original_code": loc_original_code, }, ) self._add_user_defined(node) - self.uast.add_edge(node1 = parent, node2 = node, directed_relation = "parent_node") + self.uast.add_edge(node1=parent, node2=node, directed_relation="parent_node") parent = node for child in AST_node.children: try: - self._dfs(AST_node= child, parent = parent) + self._dfs(AST_node=child, parent=parent) except RecursionError as e: print(f"RecursionError caught: {str(e)}") @@ -267,20 +280,22 @@ def _extract(self, ast_snippet, node_type, exec_string): except Exception as e: print(e) try: - return self.grammar[node_type]["keyword"] + " " + self.extracted + return self.grammar[node_type]["keyword"] + " " + self.extracted except Exception as e: print(e) + def uast_read(jsonstring): """ Reads an input json string into UAST class object """ uast = UAST() - if jsonstring is not None and jsonstring != 'null': + if jsonstring is not None and jsonstring != "null": uast.load_from_json_string(jsonstring) return uast return None + def extract_ccr(uast): """ Calculates the code to comment ratio given an UAST object as input @@ -289,12 +304,12 @@ def extract_ccr(uast): total_comment_loc = 0 for node_idx in uast.nodes: node = uast.get_node(node_idx) - if node.node_type == 'uast_comment': + if node.node_type == "uast_comment": total_comment_loc += node.metadata.get("loc_original_code", 0) - elif node.node_type == 'uast_root': + elif node.node_type == "uast_root": loc_snippet = node.metadata.get("loc_snippet", 0) if total_comment_loc > 0: return loc_snippet / total_comment_loc else: - return None + return None return None diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/cpp/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/__init__.py similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/cpp/prompt.txt rename to transforms/code/code_profiler/dpk_code_profiler/__init__.py diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/agda/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/agda/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/agda/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/agda/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/c/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/c/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/c/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/c/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/c_sharp/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/c_sharp/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/c_sharp/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/c_sharp/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/cpp/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/cpp/0.txt similarity index 100% rename from 
transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/cpp/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/cpp/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/d/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/d/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/d/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/d/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/dart/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/dart/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/dart/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/dart/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/dart/1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/dart/1.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/dart/1.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/dart/1.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/elm/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/elm/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/elm/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/elm/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/elm/1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/elm/1.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/elm/1.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/elm/1.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/go/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/go/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/go/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/go/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/haskell/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/haskell/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/haskell/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/haskell/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/java/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/java/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/java/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/java/0.txt diff --git 
a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/java/1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/java/1.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/java/1.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/java/1.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/js/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/js/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/js/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/js/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/kotlin/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/kotlin/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/kotlin/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/kotlin/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/kotlin/1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/kotlin/1.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/kotlin/1.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/kotlin/1.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/nim/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/nim/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/nim/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/nim/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/nim/1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/nim/1.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/nim/1.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/nim/1.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/objc/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/objc/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/objc/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/objc/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/ocaml/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/ocaml/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/ocaml/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/ocaml/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/py/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/py/0.txt similarity index 100% rename from 
transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/py/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/py/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/qmljs/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/qmljs/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/qmljs/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/qmljs/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/rust/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/rust/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/rust/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/rust/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/scala/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/scala/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/scala/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/scala/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/scala/1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/scala/1.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/scala/1.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/scala/1.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/ts/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/ts/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/ts/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/ts/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/verilog/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/verilog/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/verilog/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/verilog/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/vhdl/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/vhdl/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_comment/vhdl/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_comment/vhdl/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_function/agda/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_function/agda/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_function/agda/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_function/agda/0.txt diff --git 
a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_function/c/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_function/c/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_function/c/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_function/c/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_function/c_sharp/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_function/c_sharp/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_function/c_sharp/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_function/c_sharp/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_function/cpp/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_function/cpp/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_function/cpp/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_function/cpp/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_function/cpp/1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_function/cpp/1.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_function/cpp/1.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_function/cpp/1.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_function/cpp/2.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_function/cpp/2.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_function/cpp/2.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_function/cpp/2.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_function/cpp/3.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_function/cpp/3.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_function/cpp/3.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_function/cpp/3.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_function/d/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_function/d/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_function/d/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_function/d/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_function/dart/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_function/dart/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_function/dart/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_function/dart/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_function/elm/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_function/elm/0.txt similarity index 100% rename 
from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_function/elm/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_function/elm/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_function/go/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_function/go/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_function/go/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_function/go/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_function/haskell/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_function/haskell/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_function/haskell/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_function/haskell/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_function/java/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_function/java/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_function/java/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_function/java/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_function/js/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_function/js/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_function/js/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_function/js/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_function/kotlin/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_function/kotlin/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_function/kotlin/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_function/kotlin/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_function/nim/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_function/nim/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_function/nim/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_function/nim/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_function/objc/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_function/objc/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_function/objc/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_function/objc/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_function/perl/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_function/perl/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_function/perl/0.txt rename to 
transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_function/perl/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_function/py/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_function/py/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_function/py/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_function/py/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_function/rust/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_function/rust/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_function/rust/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_function/rust/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_function/scala/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_function/scala/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_function/scala/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_function/scala/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_function/scala/1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_function/scala/1.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_function/scala/1.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_function/scala/1.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_function/ts/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_function/ts/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_function/ts/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_function/ts/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_function/verilog/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_function/verilog/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_function/verilog/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_function/verilog/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_function/vhdl/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_function/vhdl/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_function/vhdl/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_function/vhdl/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/agda/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/agda/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/agda/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/agda/0.txt diff --git 
a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/c/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/c/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/c/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/c/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/c_sharp/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/c_sharp/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/c_sharp/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/c_sharp/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/c_sharp/1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/c_sharp/1.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/c_sharp/1.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/c_sharp/1.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/cpp/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/cpp/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/cpp/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/cpp/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/cpp/1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/cpp/1.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/cpp/1.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/cpp/1.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/cpp/2.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/cpp/2.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/cpp/2.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/cpp/2.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/cpp/3.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/cpp/3.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/cpp/3.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/cpp/3.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/cpp/4.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/cpp/4.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/cpp/4.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/cpp/4.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/cpp/5.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/cpp/5.txt similarity index 100% rename from 
transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/cpp/5.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/cpp/5.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/cpp/6.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/cpp/6.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/cpp/6.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/cpp/6.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/d/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/d/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/d/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/d/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/dart/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/dart/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/dart/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/dart/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/elm/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/elm/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/elm/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/elm/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/go/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/go/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/go/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/go/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/haskell/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/haskell/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/haskell/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/haskell/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/java/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/java/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/java/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/java/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/js/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/js/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/js/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/js/0.txt diff --git 
a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/kotlin/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/kotlin/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/kotlin/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/kotlin/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/nim/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/nim/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/nim/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/nim/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/nim/1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/nim/1.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/nim/1.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/nim/1.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/nim/2.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/nim/2.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/nim/2.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/nim/2.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/objc/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/objc/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/objc/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/objc/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/objc/1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/objc/1.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/objc/1.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/objc/1.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/ocaml/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/ocaml/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/ocaml/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/ocaml/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/perl/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/perl/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/perl/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/perl/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/py/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/py/0.txt similarity index 100% rename from 
transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/py/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/py/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/py/1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/py/1.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/py/1.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/py/1.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/qmljs/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/qmljs/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/qmljs/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/qmljs/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/rust/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/rust/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/rust/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/rust/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/scala/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/scala/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/scala/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/scala/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/scala/1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/scala/1.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/scala/1.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/scala/1.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/ts/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/ts/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/ts/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/ts/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/verilog/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/verilog/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/verilog/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/verilog/0.txt diff --git a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/verilog/1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/verilog/1.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/verilog/1.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/verilog/1.txt diff --git 
a/transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/vhdl/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/vhdl/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/Concept_dataset/uast_package/vhdl/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/Concept_dataset/uast_package/vhdl/0.txt diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/agda/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/agda/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/agda/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/agda/0.txt diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/c/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/c/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/c/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/c/0.txt diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/c_sharp/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/c_sharp/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/c_sharp/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/c_sharp/0.txt diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/cpp/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/cpp/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/cpp/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/cpp/0.txt diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/d/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/d/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/d/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/d/0.txt diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/dart/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/dart/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/dart/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/dart/0.txt diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/dart/1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/dart/1.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/dart/1.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/dart/1.txt diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/elm/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/elm/0.txt similarity index 100% 
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/elm/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/elm/0.txt diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/elm/1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/elm/1.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/elm/1.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/elm/1.txt diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/go/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/go/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/go/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/go/0.txt diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/haskell/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/haskell/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/haskell/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/haskell/0.txt diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/java/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/java/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/java/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/java/0.txt diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/java/1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/java/1.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/java/1.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/java/1.txt diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/js/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/js/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/js/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/js/0.txt diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/kotlin/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/kotlin/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/kotlin/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/kotlin/0.txt diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/kotlin/1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/kotlin/1.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/kotlin/1.txt rename to 
transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/kotlin/1.txt diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/nim/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/nim/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/nim/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/nim/0.txt diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/nim/1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/nim/1.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/nim/1.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/nim/1.txt diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/objc/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/objc/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/objc/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/objc/0.txt diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/ocaml/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/ocaml/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/ocaml/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/ocaml/0.txt diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/py/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/py/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/py/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/py/0.txt diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/qmljs/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/qmljs/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/qmljs/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/qmljs/0.txt diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/rust/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/rust/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/rust/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/rust/0.txt diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/scala/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/scala/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/scala/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/scala/0.txt diff --git 
a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/scala/1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/scala/1.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/scala/1.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/scala/1.txt diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/ts/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/ts/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/ts/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/ts/0.txt diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/verilog/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/verilog/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/verilog/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/verilog/0.txt diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/vhdl/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/vhdl/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_comment/vhdl/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_comment/vhdl/0.txt diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_function/agda/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_function/agda/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_function/agda/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_function/agda/0.txt diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_function/c/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_function/c/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_function/c/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_function/c/0.txt diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_function/c_sharp/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_function/c_sharp/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_function/c_sharp/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_function/c_sharp/0.txt diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_function/cpp/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_function/cpp/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_function/cpp/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_function/cpp/0.txt diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_function/cpp/1.txt 
b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_function/cpp/1.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_function/cpp/1.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_function/cpp/1.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_function/cpp/2.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_function/cpp/2.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_function/cpp/2.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_function/cpp/2.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_function/cpp/3.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_function/cpp/3.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_function/cpp/3.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_function/cpp/3.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_function/d/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_function/d/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_function/d/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_function/d/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_function/dart/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_function/dart/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_function/dart/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_function/dart/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_function/elm/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_function/elm/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_function/elm/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_function/elm/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_function/elm/1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_function/elm/1.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_function/elm/1.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_function/elm/1.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_function/go/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_function/go/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_function/go/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_function/go/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_function/haskell/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_function/haskell/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_function/haskell/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_function/haskell/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_function/java/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_function/java/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_function/java/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_function/java/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_function/js/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_function/js/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_function/js/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_function/js/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_function/kotlin/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_function/kotlin/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_function/kotlin/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_function/kotlin/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_function/nim/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_function/nim/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_function/nim/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_function/nim/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_function/objc/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_function/objc/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_function/objc/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_function/objc/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_function/perl/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_function/perl/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_function/perl/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_function/perl/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_function/py/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_function/py/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_function/py/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_function/py/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_function/rust/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_function/rust/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_function/rust/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_function/rust/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_function/scala/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_function/scala/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_function/scala/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_function/scala/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_function/scala/1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_function/scala/1.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_function/scala/1.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_function/scala/1.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_function/ts/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_function/ts/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_function/ts/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_function/ts/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_function/verilog/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_function/verilog/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_function/verilog/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_function/verilog/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_function/vhdl/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_function/vhdl/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_function/vhdl/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_function/vhdl/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/agda/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/agda/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/agda/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/agda/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/c/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/c/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/c/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/c/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/c_sharp/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/c_sharp/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/c_sharp/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/c_sharp/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/c_sharp/1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/c_sharp/1.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/c_sharp/1.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/c_sharp/1.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/cpp/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/cpp/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/cpp/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/cpp/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/cpp/1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/cpp/1.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/cpp/1.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/cpp/1.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/cpp/2.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/cpp/2.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/cpp/2.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/cpp/2.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/cpp/3.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/cpp/3.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/cpp/3.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/cpp/3.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/cpp/4.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/cpp/4.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/cpp/4.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/cpp/4.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/cpp/5.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/cpp/5.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/cpp/5.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/cpp/5.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/cpp/6.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/cpp/6.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/cpp/6.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/cpp/6.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/d/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/d/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/d/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/d/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/dart/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/dart/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/dart/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/dart/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/elm/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/elm/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/elm/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/elm/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/go/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/go/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/go/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/go/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/haskell/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/haskell/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/haskell/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/haskell/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/java/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/java/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/java/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/java/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/js/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/js/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/js/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/js/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/kotlin/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/kotlin/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/kotlin/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/kotlin/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/nim/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/nim/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/nim/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/nim/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/nim/1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/nim/1.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/nim/1.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/nim/1.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/nim/2.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/nim/2.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/nim/2.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/nim/2.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/objc/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/objc/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/objc/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/objc/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/objc/1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/objc/1.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/objc/1.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/objc/1.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/ocaml/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/ocaml/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/ocaml/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/ocaml/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/perl/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/perl/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/perl/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/perl/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/py/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/py/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/py/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/py/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/py/1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/py/1.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/py/1.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/py/1.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/qmljs/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/qmljs/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/qmljs/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/qmljs/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/rust/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/rust/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/rust/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/rust/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/scala/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/scala/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/scala/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/scala/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/scala/1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/scala/1.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/scala/1.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/scala/1.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/ts/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/ts/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/ts/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/ts/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/verilog/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/verilog/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/verilog/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/verilog/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/verilog/1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/verilog/1.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/verilog/1.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/verilog/1.txt
diff --git a/transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/vhdl/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/vhdl/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/few_shot_outputs/uast_package/vhdl/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/few_shot_outputs/uast_package/vhdl/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/agda/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/agda/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/agda/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/agda/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/agda/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/agda/example_languages.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/agda/example_languages.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/agda/example_languages.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/agda/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/agda/prompt.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/agda/prompt.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/agda/prompt.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/agda/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/agda/test_code.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/agda/test_code.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/agda/test_code.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/c/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/c/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/c/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/c/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/c/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/c/example_languages.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/c/example_languages.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/c/example_languages.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/c/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/c/prompt.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/c/prompt.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/c/prompt.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/c/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/c/test_code.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/c/test_code.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/c/test_code.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/c_sharp/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/c_sharp/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/c_sharp/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/c_sharp/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/c_sharp/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/c_sharp/example_languages.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/c_sharp/example_languages.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/c_sharp/example_languages.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/c_sharp/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/c_sharp/prompt.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/c_sharp/prompt.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/c_sharp/prompt.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/c_sharp/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/c_sharp/test_code.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/c_sharp/test_code.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/c_sharp/test_code.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/cpp/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/cpp/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/cpp/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/cpp/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/d/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/d/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/d/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/d/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/d/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/d/example_languages.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/d/example_languages.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/d/example_languages.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/d/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/d/prompt.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/d/prompt.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/d/prompt.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/d/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/d/test_code.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/d/test_code.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/d/test_code.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/dart/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/dart/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/dart/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/dart/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/dart/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/dart/example_languages.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/dart/example_languages.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/dart/example_languages.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/dart/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/dart/prompt.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/dart/prompt.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/dart/prompt.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/dart/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/dart/test_code.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/dart/test_code.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/dart/test_code.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/elm/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/elm/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/elm/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/elm/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/elm/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/elm/example_languages.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/elm/example_languages.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/elm/example_languages.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/elm/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/elm/prompt.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/elm/prompt.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/elm/prompt.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/elm/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/elm/test_code.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/elm/test_code.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/elm/test_code.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/go/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/go/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/go/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/go/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/go/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/go/example_languages.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/go/example_languages.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/go/example_languages.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/go/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/go/prompt.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/go/prompt.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/go/prompt.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/go/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/go/test_code.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/go/test_code.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/go/test_code.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/haskell/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/haskell/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/haskell/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/haskell/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/haskell/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/haskell/example_languages.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/haskell/example_languages.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/haskell/example_languages.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/haskell/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/haskell/prompt.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/haskell/prompt.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/haskell/prompt.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/haskell/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/haskell/test_code.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/haskell/test_code.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/haskell/test_code.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/java/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/java/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/java/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/java/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/java/1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/java/1.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/java/1.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/java/1.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/js/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/js/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/js/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/js/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/js/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/js/prompt.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/js/prompt.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/js/prompt.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/kotlin/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/kotlin/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/kotlin/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/kotlin/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/kotlin/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/kotlin/example_languages.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/kotlin/example_languages.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/kotlin/example_languages.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/kotlin/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/kotlin/prompt.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/kotlin/prompt.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/kotlin/prompt.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/kotlin/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/kotlin/test_code.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/kotlin/test_code.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/kotlin/test_code.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/nim/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/nim/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/nim/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/nim/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/nim/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/nim/example_languages.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/nim/example_languages.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/nim/example_languages.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/nim/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/nim/prompt.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/nim/prompt.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/nim/prompt.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/nim/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/nim/test_code.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/nim/test_code.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/nim/test_code.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/objc/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/objc/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/objc/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/objc/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/objc/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/objc/example_languages.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/objc/example_languages.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/objc/example_languages.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/objc/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/objc/prompt.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/objc/prompt.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/objc/prompt.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/objc/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/objc/test_code.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/objc/test_code.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/objc/test_code.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/ocaml/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/ocaml/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/ocaml/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/ocaml/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/ocaml/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/ocaml/example_languages.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/ocaml/example_languages.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/ocaml/example_languages.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/ocaml/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/ocaml/prompt.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/ocaml/prompt.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/ocaml/prompt.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/ocaml/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/ocaml/test_code.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/ocaml/test_code.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/ocaml/test_code.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/py/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/py/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/py/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/py/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/qmljs/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/qmljs/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/qmljs/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/qmljs/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/qmljs/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/qmljs/example_languages.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/qmljs/example_languages.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/qmljs/example_languages.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/qmljs/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/qmljs/prompt.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/qmljs/prompt.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/qmljs/prompt.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/qmljs/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/qmljs/test_code.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/qmljs/test_code.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/qmljs/test_code.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/rust/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/rust/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/rust/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/rust/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/rust/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/rust/example_languages.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/rust/example_languages.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/rust/example_languages.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/rust/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/rust/prompt.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/rust/prompt.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/rust/prompt.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/rust/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/rust/test_code.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/rust/test_code.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/rust/test_code.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/scala/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/scala/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/scala/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/scala/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/scala/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/scala/example_languages.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/scala/example_languages.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/scala/example_languages.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/scala/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/scala/prompt.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/scala/prompt.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/scala/prompt.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/scala/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/scala/test_code.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/scala/test_code.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/scala/test_code.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/ts/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/ts/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/ts/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/ts/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/ts/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/ts/example_languages.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/ts/example_languages.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/ts/example_languages.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/ts/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/ts/prompt.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/ts/prompt.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/ts/prompt.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/ts/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/ts/test_code.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/ts/test_code.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/ts/test_code.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/verilog/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/verilog/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/verilog/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/verilog/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/verilog/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/verilog/example_languages.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/verilog/example_languages.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/verilog/example_languages.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/verilog/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/verilog/prompt.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/verilog/prompt.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/verilog/prompt.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/verilog/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/verilog/test_code.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/verilog/test_code.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/verilog/test_code.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/vhdl/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/vhdl/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/vhdl/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/vhdl/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/vhdl/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/vhdl/example_languages.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/vhdl/example_languages.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/vhdl/example_languages.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/vhdl/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/vhdl/prompt.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/vhdl/prompt.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/vhdl/prompt.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/vhdl/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/vhdl/test_code.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/comment/vhdl/test_code.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/comment/vhdl/test_code.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/agda/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/agda/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/agda/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/agda/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/agda/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/agda/example_languages.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/agda/example_languages.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/agda/example_languages.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/agda/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/agda/prompt.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/agda/prompt.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/agda/prompt.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/agda/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/agda/test_code.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/agda/test_code.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/agda/test_code.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/c/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/c/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/c/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/c/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/c/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/c/example_languages.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/c/example_languages.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/c/example_languages.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/c/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/c/prompt.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/c/prompt.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/c/prompt.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/c/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/c/test_code.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/c/test_code.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/c/test_code.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/c_sharp/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/c_sharp/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/c_sharp/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/c_sharp/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/c_sharp/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/c_sharp/example_languages.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/c_sharp/example_languages.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/c_sharp/example_languages.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/c_sharp/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/c_sharp/prompt.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/c_sharp/prompt.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/c_sharp/prompt.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/c_sharp/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/c_sharp/test_code.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/c_sharp/test_code.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/c_sharp/test_code.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/cpp/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/cpp/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/cpp/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/cpp/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/cpp/1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/cpp/1.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/cpp/1.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/cpp/1.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/cpp/2.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/cpp/2.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/cpp/2.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/cpp/2.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/cpp/3.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/cpp/3.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/cpp/3.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/cpp/3.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/cpp/example_languages_1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/cpp/example_languages_1.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/cpp/example_languages_1.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/cpp/example_languages_1.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/cpp/example_languages_2.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/cpp/example_languages_2.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/cpp/example_languages_2.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/cpp/example_languages_2.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/cpp/example_languages_3.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/cpp/example_languages_3.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/cpp/example_languages_3.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/cpp/example_languages_3.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/cpp/prompt_1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/cpp/prompt_1.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/cpp/prompt_1.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/cpp/prompt_1.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/cpp/prompt_2.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/cpp/prompt_2.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/cpp/prompt_2.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/cpp/prompt_2.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/cpp/prompt_3.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/cpp/prompt_3.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/cpp/prompt_3.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/cpp/prompt_3.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/cpp/test_code_1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/cpp/test_code_1.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/cpp/test_code_1.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/cpp/test_code_1.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/cpp/test_code_2.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/cpp/test_code_2.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/cpp/test_code_2.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/cpp/test_code_2.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/cpp/test_code_3.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/cpp/test_code_3.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/cpp/test_code_3.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/cpp/test_code_3.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/d/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/d/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/d/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/d/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/d/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/d/example_languages.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/d/example_languages.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/d/example_languages.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/d/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/d/prompt.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/d/prompt.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/d/prompt.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/d/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/d/test_code.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/d/test_code.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/d/test_code.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/dart/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/dart/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/dart/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/dart/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/dart/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/dart/example_languages.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/dart/example_languages.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/dart/example_languages.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/dart/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/dart/prompt.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/dart/prompt.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/dart/prompt.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/dart/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/dart/test_code.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/dart/test_code.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/dart/test_code.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/elm/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/elm/0.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/elm/0.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/elm/0.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/elm/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/elm/example_languages.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/elm/example_languages.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/elm/example_languages.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/elm/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/elm/prompt.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/elm/prompt.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/elm/prompt.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/elm/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/elm/test_code.txt
similarity index 100%
rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/elm/test_code.txt
rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/elm/test_code.txt
diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/go/0.txt
b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/go/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/go/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/go/0.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/go/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/go/example_languages.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/go/example_languages.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/go/example_languages.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/go/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/go/prompt.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/go/prompt.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/go/prompt.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/go/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/go/test_code.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/go/test_code.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/go/test_code.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/haskell/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/haskell/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/haskell/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/haskell/0.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/haskell/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/haskell/example_languages.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/haskell/example_languages.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/haskell/example_languages.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/haskell/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/haskell/prompt.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/haskell/prompt.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/haskell/prompt.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/haskell/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/haskell/test_code.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/haskell/test_code.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/haskell/test_code.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/java/0.txt 
b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/java/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/java/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/java/0.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/js/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/js/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/js/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/js/0.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/js/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/js/prompt.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/js/prompt.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/js/prompt.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/kotlin/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/kotlin/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/kotlin/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/kotlin/0.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/kotlin/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/kotlin/example_languages.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/kotlin/example_languages.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/kotlin/example_languages.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/kotlin/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/kotlin/prompt.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/kotlin/prompt.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/kotlin/prompt.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/kotlin/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/kotlin/test_code.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/kotlin/test_code.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/kotlin/test_code.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/nim/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/nim/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/nim/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/nim/0.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/nim/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/nim/example_languages.txt similarity index 100% rename from 
transforms/code/code_profiler/python/src/data/final_UI_outputs/function/nim/example_languages.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/nim/example_languages.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/nim/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/nim/prompt.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/nim/prompt.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/nim/prompt.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/nim/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/nim/test_code.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/nim/test_code.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/nim/test_code.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/objc/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/objc/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/objc/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/objc/0.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/objc/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/objc/example_languages.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/objc/example_languages.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/objc/example_languages.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/objc/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/objc/prompt.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/objc/prompt.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/objc/prompt.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/objc/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/objc/test_code.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/objc/test_code.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/objc/test_code.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/perl/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/perl/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/perl/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/perl/0.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/perl/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/perl/example_languages.txt similarity index 100% rename from 
transforms/code/code_profiler/python/src/data/final_UI_outputs/function/perl/example_languages.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/perl/example_languages.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/perl/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/perl/prompt.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/perl/prompt.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/perl/prompt.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/perl/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/perl/test_code.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/perl/test_code.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/perl/test_code.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/py/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/py/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/py/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/py/0.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/rust/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/rust/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/rust/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/rust/0.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/rust/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/rust/example_languages.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/rust/example_languages.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/rust/example_languages.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/rust/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/rust/prompt.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/rust/prompt.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/rust/prompt.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/rust/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/rust/test_code.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/rust/test_code.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/rust/test_code.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/scala/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/scala/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/scala/0.txt rename to 
transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/scala/0.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/scala/1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/scala/1.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/scala/1.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/scala/1.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/scala/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/scala/example_languages.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/scala/example_languages.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/scala/example_languages.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/scala/example_languages_1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/scala/example_languages_1.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/scala/example_languages_1.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/scala/example_languages_1.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/scala/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/scala/prompt.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/scala/prompt.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/scala/prompt.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/scala/prompt_1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/scala/prompt_1.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/scala/prompt_1.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/scala/prompt_1.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/scala/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/scala/test_code.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/scala/test_code.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/scala/test_code.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/scala/test_code_1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/scala/test_code_1.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/scala/test_code_1.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/scala/test_code_1.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/ts/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/ts/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/ts/0.txt rename to 
transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/ts/0.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/ts/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/ts/prompt.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/ts/prompt.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/ts/prompt.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/verilog/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/verilog/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/verilog/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/verilog/0.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/verilog/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/verilog/example_languages.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/verilog/example_languages.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/verilog/example_languages.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/verilog/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/verilog/prompt.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/verilog/prompt.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/verilog/prompt.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/verilog/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/verilog/test_code.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/verilog/test_code.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/verilog/test_code.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/vhdl/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/vhdl/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/vhdl/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/vhdl/0.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/vhdl/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/vhdl/example_languages.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/vhdl/example_languages.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/vhdl/example_languages.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/vhdl/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/vhdl/prompt.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/vhdl/prompt.txt rename to 
transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/vhdl/prompt.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/function/vhdl/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/vhdl/test_code.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/function/vhdl/test_code.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/function/vhdl/test_code.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/agda/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/agda/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/agda/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/agda/0.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/agda/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/agda/example_languages.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/agda/example_languages.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/agda/example_languages.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/agda/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/agda/prompt.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/agda/prompt.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/agda/prompt.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/agda/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/agda/test_code.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/agda/test_code.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/agda/test_code.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/c/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/c/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/c/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/c/0.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/c/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/c/example_languages.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/c/example_languages.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/c/example_languages.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/c/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/c/prompt.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/c/prompt.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/c/prompt.txt diff --git 
a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/c/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/c/test_code.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/c/test_code.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/c/test_code.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/c_sharp/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/c_sharp/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/c_sharp/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/c_sharp/0.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/c_sharp/1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/c_sharp/1.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/c_sharp/1.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/c_sharp/1.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/c_sharp/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/c_sharp/example_languages.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/c_sharp/example_languages.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/c_sharp/example_languages.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/c_sharp/example_languages_1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/c_sharp/example_languages_1.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/c_sharp/example_languages_1.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/c_sharp/example_languages_1.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/c_sharp/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/c_sharp/prompt.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/c_sharp/prompt.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/c_sharp/prompt.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/c_sharp/prompt_1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/c_sharp/prompt_1.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/c_sharp/prompt_1.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/c_sharp/prompt_1.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/c_sharp/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/c_sharp/test_code.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/c_sharp/test_code.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/c_sharp/test_code.txt diff --git 
a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/c_sharp/test_code_1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/c_sharp/test_code_1.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/c_sharp/test_code_1.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/c_sharp/test_code_1.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/cpp/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/cpp/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/cpp/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/cpp/0.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/cpp/1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/cpp/1.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/cpp/1.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/cpp/1.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/cpp/2.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/cpp/2.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/cpp/2.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/cpp/2.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/cpp/3.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/cpp/3.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/cpp/3.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/cpp/3.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/cpp/4.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/cpp/4.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/cpp/4.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/cpp/4.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/cpp/5.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/cpp/5.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/cpp/5.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/cpp/5.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/cpp/6.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/cpp/6.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/cpp/6.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/cpp/6.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/cpp/example_languages_1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/cpp/example_languages_1.txt similarity index 100% rename from 
transforms/code/code_profiler/python/src/data/final_UI_outputs/package/cpp/example_languages_1.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/cpp/example_languages_1.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/cpp/example_languages_2.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/cpp/example_languages_2.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/cpp/example_languages_2.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/cpp/example_languages_2.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/cpp/example_languages_3.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/cpp/example_languages_3.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/cpp/example_languages_3.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/cpp/example_languages_3.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/cpp/example_languages_4.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/cpp/example_languages_4.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/cpp/example_languages_4.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/cpp/example_languages_4.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/cpp/example_languages_5.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/cpp/example_languages_5.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/cpp/example_languages_5.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/cpp/example_languages_5.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/cpp/example_languages_6.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/cpp/example_languages_6.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/cpp/example_languages_6.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/cpp/example_languages_6.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/go/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/cpp/prompt.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/go/prompt.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/cpp/prompt.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/cpp/prompt_1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/cpp/prompt_1.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/cpp/prompt_1.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/cpp/prompt_1.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/cpp/prompt_2.txt 
b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/cpp/prompt_2.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/cpp/prompt_2.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/cpp/prompt_2.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/cpp/prompt_3.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/cpp/prompt_3.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/cpp/prompt_3.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/cpp/prompt_3.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/cpp/prompt_4.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/cpp/prompt_4.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/cpp/prompt_4.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/cpp/prompt_4.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/cpp/prompt_5.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/cpp/prompt_5.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/cpp/prompt_5.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/cpp/prompt_5.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/cpp/prompt_6.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/cpp/prompt_6.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/cpp/prompt_6.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/cpp/prompt_6.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/cpp/test_code_1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/cpp/test_code_1.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/cpp/test_code_1.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/cpp/test_code_1.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/cpp/test_code_2.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/cpp/test_code_2.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/cpp/test_code_2.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/cpp/test_code_2.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/cpp/test_code_3.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/cpp/test_code_3.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/cpp/test_code_3.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/cpp/test_code_3.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/cpp/test_code_4.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/cpp/test_code_4.txt similarity 
index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/cpp/test_code_4.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/cpp/test_code_4.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/cpp/test_code_5.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/cpp/test_code_5.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/cpp/test_code_5.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/cpp/test_code_5.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/cpp/test_code_6.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/cpp/test_code_6.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/cpp/test_code_6.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/cpp/test_code_6.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/d/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/d/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/d/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/d/0.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/d/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/d/example_languages.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/d/example_languages.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/d/example_languages.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/d/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/d/prompt.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/d/prompt.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/d/prompt.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/d/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/d/test_code.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/d/test_code.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/d/test_code.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/dart/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/dart/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/dart/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/dart/0.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/dart/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/dart/example_languages.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/dart/example_languages.txt rename to 
transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/dart/example_languages.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/dart/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/dart/prompt.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/dart/prompt.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/dart/prompt.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/dart/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/dart/test_code.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/dart/test_code.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/dart/test_code.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/elm/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/elm/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/elm/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/elm/0.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/elm/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/elm/example_languages.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/elm/example_languages.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/elm/example_languages.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/elm/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/elm/prompt.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/elm/prompt.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/elm/prompt.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/elm/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/elm/test_code.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/elm/test_code.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/elm/test_code.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/go/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/go/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/go/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/go/0.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/java/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/go/prompt.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/java/prompt.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/go/prompt.txt diff --git 
a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/haskell/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/haskell/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/haskell/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/haskell/0.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/haskell/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/haskell/example_languages.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/haskell/example_languages.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/haskell/example_languages.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/haskell/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/haskell/prompt.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/haskell/prompt.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/haskell/prompt.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/haskell/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/haskell/test_code.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/haskell/test_code.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/haskell/test_code.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/java/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/java/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/java/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/java/0.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/py/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/java/prompt.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/py/prompt.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/java/prompt.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/js/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/js/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/js/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/js/0.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/js/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/js/example_languages.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/js/example_languages.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/js/example_languages.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/js/prompt.txt 
b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/js/prompt.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/js/prompt.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/js/prompt.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/js/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/js/test_code.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/js/test_code.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/js/test_code.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/kotlin/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/kotlin/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/kotlin/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/kotlin/0.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/kotlin/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/kotlin/example_languages.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/kotlin/example_languages.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/kotlin/example_languages.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/kotlin/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/kotlin/prompt.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/kotlin/prompt.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/kotlin/prompt.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/kotlin/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/kotlin/test_code.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/kotlin/test_code.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/kotlin/test_code.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/nim/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/nim/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/nim/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/nim/0.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/nim/1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/nim/1.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/nim/1.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/nim/1.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/nim/2.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/nim/2.txt similarity index 100% rename from 
transforms/code/code_profiler/python/src/data/final_UI_outputs/package/nim/2.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/nim/2.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/nim/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/nim/example_languages.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/nim/example_languages.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/nim/example_languages.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/nim/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/nim/prompt.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/nim/prompt.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/nim/prompt.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/nim/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/nim/test_code.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/nim/test_code.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/nim/test_code.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/nim/test_code_0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/nim/test_code_0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/nim/test_code_0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/nim/test_code_0.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/nim/test_code_1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/nim/test_code_1.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/nim/test_code_1.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/nim/test_code_1.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/nim/test_code_2.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/nim/test_code_2.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/nim/test_code_2.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/nim/test_code_2.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/objc/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/objc/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/objc/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/objc/0.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/objc/1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/objc/1.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/objc/1.txt rename to 
transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/objc/1.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/objc/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/objc/example_languages.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/objc/example_languages.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/objc/example_languages.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/objc/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/objc/prompt.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/objc/prompt.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/objc/prompt.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/objc/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/objc/test_code.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/objc/test_code.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/objc/test_code.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/objc/test_code_1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/objc/test_code_1.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/objc/test_code_1.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/objc/test_code_1.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/ocaml/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/ocaml/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/ocaml/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/ocaml/0.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/ocaml/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/ocaml/example_languages.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/ocaml/example_languages.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/ocaml/example_languages.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/ocaml/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/ocaml/prompt.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/ocaml/prompt.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/ocaml/prompt.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/ocaml/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/ocaml/test_code.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/ocaml/test_code.txt rename to 
transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/ocaml/test_code.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/perl/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/perl/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/perl/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/perl/0.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/perl/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/perl/example_languages.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/perl/example_languages.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/perl/example_languages.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/perl/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/perl/prompt.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/perl/prompt.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/perl/prompt.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/perl/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/perl/test_code.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/perl/test_code.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/perl/test_code.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/py/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/py/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/py/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/py/0.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/py/1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/py/1.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/py/1.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/py/1.txt diff --git a/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/py/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/py/prompt.txt new file mode 100644 index 000000000..e69de29bb diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/qmljs/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/qmljs/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/qmljs/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/qmljs/0.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/qmljs/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/qmljs/example_languages.txt similarity index 100% rename from 
transforms/code/code_profiler/python/src/data/final_UI_outputs/package/qmljs/example_languages.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/qmljs/example_languages.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/qmljs/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/qmljs/prompt.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/qmljs/prompt.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/qmljs/prompt.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/qmljs/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/qmljs/test_code.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/qmljs/test_code.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/qmljs/test_code.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/rust/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/rust/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/rust/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/rust/0.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/rust/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/rust/example_languages.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/rust/example_languages.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/rust/example_languages.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/rust/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/rust/prompt.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/rust/prompt.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/rust/prompt.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/rust/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/rust/test_code.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/rust/test_code.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/rust/test_code.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/scala/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/scala/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/scala/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/scala/0.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/scala/1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/scala/1.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/scala/1.txt rename to 
transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/scala/1.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/scala/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/scala/example_languages.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/scala/example_languages.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/scala/example_languages.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/scala/example_languages_1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/scala/example_languages_1.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/scala/example_languages_1.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/scala/example_languages_1.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/scala/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/scala/prompt.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/scala/prompt.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/scala/prompt.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/scala/prompt_1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/scala/prompt_1.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/scala/prompt_1.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/scala/prompt_1.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/scala/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/scala/test_code.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/scala/test_code.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/scala/test_code.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/scala/test_code_1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/scala/test_code_1.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/scala/test_code_1.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/scala/test_code_1.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/ts/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/ts/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/ts/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/ts/0.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/ts/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/ts/example_languages.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/ts/example_languages.txt rename to 
transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/ts/example_languages.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/ts/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/ts/prompt.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/ts/prompt.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/ts/prompt.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/ts/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/ts/test_code.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/ts/test_code.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/ts/test_code.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/verilog/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/verilog/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/verilog/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/verilog/0.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/verilog/1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/verilog/1.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/verilog/1.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/verilog/1.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/verilog/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/verilog/example_languages.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/verilog/example_languages.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/verilog/example_languages.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/verilog/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/verilog/prompt.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/verilog/prompt.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/verilog/prompt.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/verilog/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/verilog/test_code.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/verilog/test_code.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/verilog/test_code.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/verilog/test_code_1.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/verilog/test_code_1.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/verilog/test_code_1.txt rename to 
transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/verilog/test_code_1.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/vhdl/0.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/vhdl/0.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/vhdl/0.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/vhdl/0.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/vhdl/example_languages.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/vhdl/example_languages.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/vhdl/example_languages.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/vhdl/example_languages.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/vhdl/prompt.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/vhdl/prompt.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/vhdl/prompt.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/vhdl/prompt.txt diff --git a/transforms/code/code_profiler/python/src/data/final_UI_outputs/package/vhdl/test_code.txt b/transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/vhdl/test_code.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/final_UI_outputs/package/vhdl/test_code.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/final_UI_outputs/package/vhdl/test_code.txt diff --git a/transforms/code/code_profiler/python/src/data/helper.ipynb b/transforms/code/code_profiler/dpk_code_profiler/data/helper.ipynb similarity index 100% rename from transforms/code/code_profiler/python/src/data/helper.ipynb rename to transforms/code/code_profiler/dpk_code_profiler/data/helper.ipynb diff --git a/transforms/code/code_profiler/python/src/data/prompts/comment.txt b/transforms/code/code_profiler/dpk_code_profiler/data/prompts/comment.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/prompts/comment.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/prompts/comment.txt diff --git a/transforms/code/code_profiler/python/src/data/prompts/function.txt b/transforms/code/code_profiler/dpk_code_profiler/data/prompts/function.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/prompts/function.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/prompts/function.txt diff --git a/transforms/code/code_profiler/python/src/data/prompts/package.txt b/transforms/code/code_profiler/dpk_code_profiler/data/prompts/package.txt similarity index 100% rename from transforms/code/code_profiler/python/src/data/prompts/package.txt rename to transforms/code/code_profiler/dpk_code_profiler/data/prompts/package.txt diff --git a/transforms/code/code_profiler/dpk_code_profiler/grammar/UAST_Grammar.json b/transforms/code/code_profiler/dpk_code_profiler/grammar/UAST_Grammar.json new file mode 100644 index 000000000..23bd5849a --- /dev/null +++ b/transforms/code/code_profiler/dpk_code_profiler/grammar/UAST_Grammar.json @@ -0,0 +1,20 @@ +{ + "uast_root": { + "keyword": "uast_root" + }, + "uast_package": { + "keyword": 
"uast_package" + }, + "uast_comment": { + "keyword": "uast_comment" + }, + "uast_class": { + "keyword": "uast_class" + }, + "uast_function": { + "keyword": "uast_function" + }, + "uast_call": { + "keyword": "uast_call" + } +} diff --git a/transforms/code/code_profiler/python/src/higher_order_concepts.py b/transforms/code/code_profiler/dpk_code_profiler/higher_order_concepts.py similarity index 89% rename from transforms/code/code_profiler/python/src/higher_order_concepts.py rename to transforms/code/code_profiler/dpk_code_profiler/higher_order_concepts.py index a33da1e19..7461556a3 100644 --- a/transforms/code/code_profiler/python/src/higher_order_concepts.py +++ b/transforms/code/code_profiler/dpk_code_profiler/higher_order_concepts.py @@ -12,6 +12,7 @@ from UAST import * + def extract_ccr(uast): """ Calculates the code to comment ratio given an UAST object as input @@ -20,9 +21,9 @@ def extract_ccr(uast): total_comment_loc = 0 for node_idx in uast.nodes: node = uast.get_node(node_idx) - if node.node_type == 'uast_comment': + if node.node_type == "uast_comment": total_comment_loc += node.metadata.get("loc_original_code", 0) - elif node.node_type == 'uast_root': + elif node.node_type == "uast_root": loc_snippet = node.metadata.get("loc_snippet", 0) if loc_snippet > 0: if total_comment_loc > 0: @@ -33,17 +34,18 @@ def extract_ccr(uast): return str(-1) return str(0) + def extract_code_snippet_length(uast): if uast is not None: for node_idx in uast.nodes: node = uast.get_node(node_idx) - if node.node_type == 'uast_root': + if node.node_type == "uast_root": loc_snippet = node.metadata.get("loc_snippet", 0) if loc_snippet > 0: return str(loc_snippet) else: return str(0) - + def extract_code_avg_fn_len_in_snippet(uast): if uast is not None: @@ -51,12 +53,11 @@ def extract_code_avg_fn_len_in_snippet(uast): fn_node_count = 0 for node_idx in uast.nodes: node = uast.get_node(node_idx) - if node.node_type == 'uast_function': + if node.node_type == "uast_function": total_fn_loc += node.metadata.get("loc_original_code", 0) - fn_node_count +=1 + fn_node_count += 1 if fn_node_count > 0: return str(round(float(total_fn_loc) / float(fn_node_count), 1)) else: return str(0) - diff --git a/transforms/code/code_profiler/python/src/code_profiler_local.py b/transforms/code/code_profiler/dpk_code_profiler/local.py similarity index 96% rename from transforms/code/code_profiler/python/src/code_profiler_local.py rename to transforms/code/code_profiler/dpk_code_profiler/local.py index 976bcd937..e61ccbce1 100644 --- a/transforms/code/code_profiler/python/src/code_profiler_local.py +++ b/transforms/code/code_profiler/dpk_code_profiler/local.py @@ -12,11 +12,11 @@ import json import os -import pyarrow.parquet as pq - +import pyarrow.parquet as pq from data_processing.data_access import DataAccessLocal -from code_profiler_transform import CodeProfilerTransform +from dpk_code_profiler.transform import CodeProfilerTransform + # create parameters input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../", "../", "input")) @@ -31,6 +31,7 @@ # Print the loaded dictionary print("Loaded dictionary:", profiler_params) + def save_tables_to_parquet(table_list, output_folder, base_filename): """ Save each table in the table_list to individual parquet files in the specified output folder. 
@@ -45,6 +46,7 @@ def save_tables_to_parquet(table_list, output_folder, base_filename): pq.write_table(table, output_file) print(f"Table {idx} saved to {output_file}") + if __name__ == "__main__": print("Code profiling started") # Here we show how to run outside of the runtime @@ -63,4 +65,4 @@ def save_tables_to_parquet(table_list, output_folder, base_filename): print(f"\noutput table has {table_list[0].num_rows} rows and {table_list[0].num_columns} columns") print(f"output metadata : {metadata}") - save_tables_to_parquet(table_list, output_folder=output_folder, base_filename="uast_table") \ No newline at end of file + save_tables_to_parquet(table_list, output_folder=output_folder, base_filename="uast_table") diff --git a/transforms/code/code_profiler/python/src/code_profiler_local_python.py b/transforms/code/code_profiler/dpk_code_profiler/local_python.py similarity index 93% rename from transforms/code/code_profiler/python/src/code_profiler_local_python.py rename to transforms/code/code_profiler/dpk_code_profiler/local_python.py index 3da183863..8a70a23a4 100644 --- a/transforms/code/code_profiler/python/src/code_profiler_local_python.py +++ b/transforms/code/code_profiler/dpk_code_profiler/local_python.py @@ -15,7 +15,8 @@ from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing.utils import ParamsUtils -from code_profiler_transform_python import CodeProfilerPythonTransformConfiguration +from dpk_code_profiler.transform_python import CodeProfilerPythonTransformConfiguration + # create parameters input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../", "../", "input")) @@ -25,7 +26,7 @@ "input_folder": input_folder, "output_folder": output_folder, "contents": "contents", - "language": "language" + "language": "language", } params = { # Data access. Only required parameters are specified diff --git a/transforms/code/code_profiler/python/src/offline-customizations/cached_requirements.json b/transforms/code/code_profiler/dpk_code_profiler/offline-customizations/cached_requirements.json similarity index 52% rename from transforms/code/code_profiler/python/src/offline-customizations/cached_requirements.json rename to transforms/code/code_profiler/dpk_code_profiler/offline-customizations/cached_requirements.json index 1893b0724..c5a55524e 100644 --- a/transforms/code/code_profiler/python/src/offline-customizations/cached_requirements.json +++ b/transforms/code/code_profiler/dpk_code_profiler/offline-customizations/cached_requirements.json @@ -1,17 +1,12 @@ { "concept_to_node_map": { "py": { - "package": [ - "import_statement", - "import_from_statement" - ], + "package": ["import_statement", "import_from_statement"], "function": [ "function_definition", "function_definition' node with id = 1, represents the definition of a function in the code. Incorporating this node, I can make a general rule to extract the definitions.\n\nThis python script can be executed:\n\n```py\n# we see that the function name is directly before the argument list, hence we get the snippet just before the first bracket of the argument list.\ntemp_0 = code_snippet.split('(')[0].strip() \n# as our required function name, from the snippet is the last one in this string, we split and get the last snippet, which is our function.\nextracted = temp_0.split(' ')[-1].strip()\n```\n\nThis script will extract the function name 'foo' from the given code snippet." 
], - "comment": [ - "comment" - ] + "comment": ["comment"] }, "cpp": { "package": [ @@ -19,245 +14,113 @@ "code snippet includes a package. Hence I consider the string after the first '#include'.\ntest = code_snippet.split('#include', 1)[1].strip()\n# In the case that there are any comments, we remove them.\ntest = test.split('//')[0].strip()\nextracted = test.split('/*')[0].strip()\n# Remove angle brackets and quotes\nextracted = extracted.replace('<', '').replace('>', '').replace('\"', '').replace(\"'\", '')\n# Remove semicolons and asterisks\nextracted = extracted.replace(';', '').replace('*', '')\nprint(extracted)\n```\n\nThis script will extract the imported packages from the code snippet, removing any comments, angle brackets, quotes, semicolons, and asterisks. The output will be:\n\n```\ncassert\nclimits\niostream\nvector\n```", "code snippet includes a package. Hence I consider the string after the first '#include'.\ntest = code_snippet.split('#include', 1)[1].strip()\n# In the case that there are any comments, we remove them.\ntest = test.split('//')[0].strip()\nextracted = test.split('/*')[0].strip()\n# Remove angle brackets and quotes\nextracted = extracted.replace('<', '').replace('>', '').replace('\"', '').replace(\"'\", '')\n# Remove semicolons and asterisks\nextracted = extracted.replace(';', '').replace('*', '')\nprint(extracted)\n```\n\nThis script will extract the imported packages from the code snippet, removing any comments, angle brackets, quotes, semicolons, and asterisks. The output will be:\n\n```\nvector\nsubstab\ncassert\nclimits\niostream\nvector\nvector\n```" ], - "function": [ - "function_declaration", - "function_definition" - ], - "comment": [ - "comment" - ] + "function": ["function_declaration", "function_definition"], + "comment": ["comment"] }, "java": { - "package": [ - "import_declaration" - ], - "function": [ - "method_declaration" - ], - "comment": [ - "line_comment", - "block_comment" - ] + "package": ["import_declaration"], + "function": ["method_declaration"], + "comment": ["line_comment", "block_comment"] }, "js": { - "package": [ - "import_statement" - ], - "function": [ - "function_declaration" - ], - "comment": [ - "comment" - ] + "package": ["import_statement"], + "function": ["function_declaration"], + "comment": ["comment"] }, "go": { - "package": [ - "import_declaration" - ], - "function": [ - "function_declaration" - ], - "comment": [ - "comment" - ] + "package": ["import_declaration"], + "function": ["function_declaration"], + "comment": ["comment"] }, "agda": { - "package": [ - "open" - ], - "function": [ - "function" - ], - "comment": [ - "comment" - ] + "package": ["open"], + "function": ["function"], + "comment": ["comment"] }, "c": { - "package": [ - "preproc_include" - ], - "function": [ - "function_definition" - ], - "comment": [ - "comment" - ] + "package": ["preproc_include"], + "function": ["function_definition"], + "comment": ["comment"] }, "c_sharp": { - "package": [ - "using_directive" - ], - "comment": [ - "comment" - ], - "function": [ - "local_function_statement" - ] + "package": ["using_directive"], + "comment": ["comment"], + "function": ["local_function_statement"] }, "d": { - "package": [ - "import_declaration" - ], - "function": [ - "function_declaration" - ], - "comment": [ - "comment" - ] + "package": ["import_declaration"], + "function": ["function_declaration"], + "comment": ["comment"] }, "dart": { - "package": [ - "import_or_export" - ], - "function": [ - "function_signature" - ], + "package": 
["import_or_export"], + "function": ["function_signature"], "comment": [ "comment", "documentation_comment' node with id = 1, represents a comment in the code. Incorporating this node, I can make a general rule to extract the comments.\n\nThis python script can be executed:\n\n```py\n# if the first three characters are '///' we can simply remove the first three characters and get the remaining string\nif (code_snippet[0:3] == '///'):\n extracted = code_snippet[3:].strip()\n```\n\nThis script will extract the comment from the given code snippet." ] }, "elm": { - "package": [ - "import_clause" - ], - "function": [ - "function_declaration_left" - ], - "comment": [ - "line_comment", - "block_comment" - ] + "package": ["import_clause"], + "function": ["function_declaration_left"], + "comment": ["line_comment", "block_comment"] }, "haskell": { - "package": [ - "import" - ], - "function": [ - "function" - ], - "comment": [ - "comment" - ] + "package": ["import"], + "function": ["function"], + "comment": ["comment"] }, "kotlin": { - "package": [ - "import_header" - ], - "comment": [ - "multiline_comment", - "line_comment" - ], - "function": [ - "function_declaration" - ] + "package": ["import_header"], + "comment": ["multiline_comment", "line_comment"], + "function": ["function_declaration"] }, "nim": { - "package": [ - "import_statement", - "include_statement", - "import_from_statement" - ], - "comment": [ - "block_comment", - "comment" - ], - "function": [ - "proc_declaration" - ] + "package": ["import_statement", "include_statement", "import_from_statement"], + "comment": ["block_comment", "comment"], + "function": ["proc_declaration"] }, "objc": { - "package": [ - "preproc_import", - "preproc_include" - ], - "function": [ - "function_definition" - ], - "comment": [ - "comment" - ] + "package": ["preproc_import", "preproc_include"], + "function": ["function_definition"], + "comment": ["comment"] }, "ocaml": { - "package": [ - "open_module" - ], - "comment": [ - "comment" - ] + "package": ["open_module"], + "comment": ["comment"] }, "perl": { - "package": [ - "use_no_statement" - ], - "function": [ - "function_definition" - ] + "package": ["use_no_statement"], + "function": ["function_definition"] }, "qmljs": { - "package": [ - "ui_import" - ], - "comment": [ - "comment" - ] + "package": ["ui_import"], + "comment": ["comment"] }, "rust": { - "package": [ - "use_declaration" - ], - "function": [ - "function_item" - ], - "comment": [ - "line_comment" - ] + "package": ["use_declaration"], + "function": ["function_item"], + "comment": ["line_comment"] }, "scala": { - "package": [ - "import_declaration" - ], - "comment": [ - "comment", - "block_comment" - ], - "function": [ - "function_definition" - ] + "package": ["import_declaration"], + "comment": ["comment", "block_comment"], + "function": ["function_definition"] }, "ts": { - "package": [ - "import_statement" - ], - "comment": [ - "comment" - ], - "function": [ - "function_declaration" - ] + "package": ["import_statement"], + "comment": ["comment"], + "function": ["function_declaration"] }, "verilog": { - "package": [ - "package_or_generate_item_declaration", - "include_compiler_directive" - ], - "comment": [ - "comment" - ], - "function": [ - "function_identifier" - ] + "package": ["package_or_generate_item_declaration", "include_compiler_directive"], + "comment": ["comment"], + "function": ["function_identifier"] }, "vhdl": { - "package": [ - "library_clause" - ], - "comment": [ - "comment" - ], - "function": [ - "function_body" - ] + 
"package": ["library_clause"], + "comment": ["comment"], + "function": ["function_body"] } }, "formal_language_example_map": { @@ -332,4 +195,4 @@ "Llama 3 Instruct: 80b": "meta-llama/llama-3-70b-instruct", "Granite Code Instruct: 34b": "ibm/granite-34b-code-instruct" } -} \ No newline at end of file +} diff --git a/transforms/code/code_profiler/python/src/offline-customizations/config_LLM_runner_app.py b/transforms/code/code_profiler/dpk_code_profiler/offline-customizations/config_LLM_runner_app.py similarity index 82% rename from transforms/code/code_profiler/python/src/offline-customizations/config_LLM_runner_app.py rename to transforms/code/code_profiler/dpk_code_profiler/offline-customizations/config_LLM_runner_app.py index 79fef11e6..469b5a636 100644 --- a/transforms/code/code_profiler/python/src/offline-customizations/config_LLM_runner_app.py +++ b/transforms/code/code_profiler/dpk_code_profiler/offline-customizations/config_LLM_runner_app.py @@ -10,13 +10,12 @@ # limitations under the License. ################################################################################ -LLM_PROVIDER="IBM-Watsonx.ai" # OPTIONS:[OpenAI, IBM-Watsonx.ai] +LLM_PROVIDER = "IBM-Watsonx.ai" # OPTIONS:[OpenAI, IBM-Watsonx.ai] -API_KEY = "r0il8aWXUvyCyDL9E6FAofeYdL-7vzsplAzkp9L0iktf" +API_KEY = "" API_ENDPOINT = "https://us-south.ml.cloud.ibm.com" MODEL_ID = "meta-llama/llama-3-70b-instruct" PROMPT_NAME = "My-prompt" # Watsonx Specific configuration -PROJECT_ID = "ba1b3e6d-5e38-4c72-9c36-4a9470cea282" - +PROJECT_ID = "" diff --git a/transforms/code/code_profiler/python/src/offline-customizations/generic_LLM_runner_app.py b/transforms/code/code_profiler/dpk_code_profiler/offline-customizations/generic_LLM_runner_app.py similarity index 62% rename from transforms/code/code_profiler/python/src/offline-customizations/generic_LLM_runner_app.py rename to transforms/code/code_profiler/dpk_code_profiler/offline-customizations/generic_LLM_runner_app.py index a8f7edb9b..21dedadaa 100644 --- a/transforms/code/code_profiler/python/src/offline-customizations/generic_LLM_runner_app.py +++ b/transforms/code/code_profiler/dpk_code_profiler/offline-customizations/generic_LLM_runner_app.py @@ -10,20 +10,28 @@ # limitations under the License. 
################################################################################ -from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams -from ibm_watsonx_ai.foundation_models import ModelInference -from ibm_watsonx_ai import Credentials -from tree_sitter import Parser, Language -import json -from tree_sitter_languages import get_language import glob +import json import os -import openai +import re from time import sleep + +import openai import streamlit as st from annotated_text import annotated_text -import re -from config_LLM_runner_app import API_ENDPOINT, API_KEY, PROJECT_ID, MODEL_ID, LLM_PROVIDER +from config_LLM_runner_app import ( + API_ENDPOINT, + API_KEY, + LLM_PROVIDER, + MODEL_ID, + PROJECT_ID, +) +from ibm_watsonx_ai import Credentials +from ibm_watsonx_ai.foundation_models import ModelInference +from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams +from tree_sitter import Language, Parser +from tree_sitter_languages import get_language + # Flag to dictate if it is concept-level pruning GET_CONCEPTS_ONLY = False @@ -31,7 +39,7 @@ TEXT_TEST_CONCEPT = False # Initialize the LLM client -if 'client' not in st.session_state: +if "client" not in st.session_state: if LLM_PROVIDER == "IBM-Watsonx.ai": # Set up IBM Watsonx.ai credentials credentials = Credentials(api_key=API_KEY, url=API_ENDPOINT) @@ -41,11 +49,11 @@ GenParams.MIN_NEW_TOKENS: 1, } # Initialize the IBM Watsonx.ai model - st.session_state['client'] = ModelInference( + st.session_state["client"] = ModelInference( model_id=MODEL_ID, params=parameters, credentials=credentials, - project_id=PROJECT_ID + project_id=PROJECT_ID, ) elif LLM_PROVIDER == "OpenAI": # Set up OpenAI API key @@ -55,21 +63,27 @@ st.stop() # Load the cached requirements -if 'cached_requirements' not in st.session_state: - st.session_state['cached_requirements'] = json.load(open('cached_requirements.json', 'r')) +if "cached_requirements" not in st.session_state: + st.session_state["cached_requirements"] = json.load(open("cached_requirements.json", "r")) # Load the necessary maps -formal_language_example_map = st.session_state['cached_requirements']['formal_language_example_map'] -formal_language_map = st.session_state['cached_requirements']['formal_language_map'] -formal_concept_map = st.session_state['cached_requirements']['formal_concept_map'] -formal_model_card_map = st.session_state['cached_requirements']['formal_model_card_map'] -concept_to_node_map = st.session_state['cached_requirements']['concept_to_node_map'] +formal_language_example_map = st.session_state["cached_requirements"]["formal_language_example_map"] +formal_language_map = st.session_state["cached_requirements"]["formal_language_map"] +formal_concept_map = st.session_state["cached_requirements"]["formal_concept_map"] +formal_model_card_map = st.session_state["cached_requirements"]["formal_model_card_map"] +concept_to_node_map = st.session_state["cached_requirements"]["concept_to_node_map"] # Option to select the few-shot examples -example_languages = st.sidebar.multiselect("Select the known languages to give few-shot examples", list(formal_language_example_map.keys())) +example_languages = st.sidebar.multiselect( + "Select the known languages to give few-shot examples", + list(formal_language_example_map.keys()), +) # Option to choose the test language -test_language = st.sidebar.selectbox("Select the unknown language you want to test", list(set(formal_language_map.keys()) - set(example_languages))) +test_language = st.sidebar.selectbox( + "Select 
the unknown language you want to test", + list(set(formal_language_map.keys()) - set(example_languages)), +) # Option to select the input method test_method = st.sidebar.selectbox("How do you want to test?", ["Local Files", "User Input"]) @@ -89,7 +103,7 @@ test_concept = st.sidebar.selectbox("Select the UAST concept you want to extract", list(formal_concept_map.keys())) # Get the current few-shot examples present within the data -present_examples = os.listdir('../data/few_shot_outputs/') +present_examples = os.listdir("../data/few_shot_outputs/") # File numbers are important as there can be multiple relevant nodes test_file_num = 0 @@ -98,17 +112,20 @@ model = st.sidebar.selectbox("Select the model you want to run the query on", list(formal_model_card_map.keys())) # Choose the pruning method -pruning_method = st.sidebar.selectbox("Select the pruning method to apply to the example ASTs", ["Concept-Level Pruning", "No Pruning", "Depth-Level Pruning"]) +pruning_method = st.sidebar.selectbox( + "Select the pruning method to apply to the example ASTs", + ["Concept-Level Pruning", "No Pruning", "Depth-Level Pruning"], +) # Set to infinity for No-pruning -max_depth = float('inf') +max_depth = float("inf") # Set flags and depth levels for different techniques if pruning_method == "Depth-Level Pruning": - max_depth = st.sidebar.slider('Select the pruning depth of the AST', min_value=1, max_value=5, value=3) + max_depth = st.sidebar.slider("Select the pruning depth of the AST", min_value=1, max_value=5, value=3) elif pruning_method == "Concept-Level Pruning": GET_CONCEPTS_ONLY = True - max_depth = st.sidebar.slider('Select the pruning depth of the test AST', min_value=1, max_value=5, value=3) + max_depth = st.sidebar.slider("Select the pruning depth of the test AST", min_value=1, max_value=5, value=3) # Few-shot example languages example_languages = [formal_language_map[lang] for lang in example_languages] @@ -123,30 +140,35 @@ model = formal_model_card_map[model] # Map to store number of present examples -if 'number_of_examples' not in st.session_state: - st.session_state['number_of_examples'] = dict() +if "number_of_examples" not in st.session_state: + st.session_state["number_of_examples"] = dict() # Save in session state -st.session_state['Languages'] = example_languages +st.session_state["Languages"] = example_languages # If it's to fetch from local storage, append the test to the example languages if not TEXT_TEST_CONCEPT: - st.session_state['Languages'] = example_languages + [test_language] + st.session_state["Languages"] = example_languages + [test_language] """ Function to convert an AST node into a string containing only the relevant data. Requires the ID of the node, the node type, the code snippet and the parent id. """ + + def create_node(id, node, parent_id): req_string = f"< node_id = {id}, node_type = {node.type}, code_snippet = {repr(node.text.decode('utf8'))}, parent_id = {parent_id} >" return req_string + """ Function to recursively assign IDs and preprocess the AST in a concept-level pruning manner to get it into a parseable format to pass to the LLM. The dfs_id() function allocates a unique ID to each tree node on a preorder-traversal basis. The _dfs() function recursively parses the tree down to the relevant nodes, storing the code snippet associated with each unique node ID.
""" + + def get_concept_tree(tree, language): ast_repr = [] code_snippets = dict() @@ -158,9 +180,9 @@ def dfs_id(node): dfs_id(child) dfs_id(tree.root_node) - + def _dfs(node, parent): - if (node.type in concept_to_node_map[language][test_concept]): + if node.type in concept_to_node_map[language][test_concept]: ast_repr.append(create_node(id_dictionary[node], node, id_dictionary[parent])) code_snippets[id_dictionary[node]] = node.text.decode("utf8") for child in node.children: @@ -168,7 +190,7 @@ def _dfs(node, parent): for child in tree.root_node.children: _dfs(child, tree.root_node) - + return ast_repr, code_snippets @@ -177,6 +199,8 @@ def _dfs(node, parent): dfs_id() function allocates a unique ID on preorder traversal basis to the treenode. _dfs() function recursively parses the tree to the relevant node, while storing the code snippet relevant to a unique ID node. """ + + def get_tree(tree, k): ast_repr = [] code_snippets = dict() @@ -188,9 +212,9 @@ def dfs_id(node): dfs_id(child) dfs_id(tree.root_node) - + def _dfs(node, depth, parent): - if (depth >= k): + if depth >= k: return ast_repr.append(create_node(id_dictionary[node], node, id_dictionary[parent])) code_snippets[id_dictionary[node]] = node.text.decode("utf8") @@ -200,34 +224,35 @@ def _dfs(node, depth, parent): # _dfs(tree.root_node, -1, tree.root_node) for child in tree.root_node.children: _dfs(child, 0, tree.root_node) - + return ast_repr, code_snippets + # initialise an AST parser. parser = Parser() -# use bindings from tree_sitter_language library. -if 'language_binding' not in st.session_state: - st.session_state['language_binding'] = { - "cpp" : get_language("cpp"), - "py" : get_language('python'), - "java" : get_language("java"), - "go" : get_language("go"), - "js" : get_language("javascript"), - "ts" : get_language("typescript"), - "perl" : get_language("perl"), - "php" : get_language("php"), - "ocaml" : get_language("ocaml") +# use bindings from tree_sitter_language library. +if "language_binding" not in st.session_state: + st.session_state["language_binding"] = { + "cpp": get_language("cpp"), + "py": get_language("python"), + "java": get_language("java"), + "go": get_language("go"), + "js": get_language("javascript"), + "ts": get_language("typescript"), + "perl": get_language("perl"), + "php": get_language("php"), + "ocaml": get_language("ocaml"), } - RUNTIME_HOST_ARCH = os.environ.get('RUNTIME_HOST_ARCH', 'x86_64') - BINDINGS_DIR = os.path.join('..', 'tree-sitter-bindings', RUNTIME_HOST_ARCH) + RUNTIME_HOST_ARCH = os.environ.get("RUNTIME_HOST_ARCH", "x86_64") + BINDINGS_DIR = os.path.join("..", "tree-sitter-bindings", RUNTIME_HOST_ARCH) # uising the normal tree-sitter bindings locally for the laguages present in the cached_requirements json. for binding in os.listdir(BINDINGS_DIR): print(binding) - name = binding.split('-bindings', 1)[0] + name = binding.split("-bindings", 1)[0] # print(name) - if name in st.session_state['language_binding']: + if name in st.session_state["language_binding"]: continue try: language_path = os.path.join(BINDINGS_DIR, binding) @@ -236,40 +261,46 @@ def _dfs(node, depth, parent): print(e) print(name) exit() - st.session_state['language_binding'][name] = language_obj + st.session_state["language_binding"][name] = language_obj -#initialize session states to contain all the outputs. -if 'all_few_shot_outputs' not in st.session_state: - st.session_state['all_few_shot_outputs'] = dict() +# initialize session states to contain all the outputs. 
+if "all_few_shot_outputs" not in st.session_state: + st.session_state["all_few_shot_outputs"] = dict() -if 'all_asts' not in st.session_state: - st.session_state['all_asts'] = dict() +if "all_asts" not in st.session_state: + st.session_state["all_asts"] = dict() -if 'all_code_snippets' not in st.session_state: - st.session_state['all_code_snippets'] = dict() +if "all_code_snippets" not in st.session_state: + st.session_state["all_code_snippets"] = dict() -if 'all_concept_code_json' not in st.session_state: - st.session_state['all_concept_code_json'] = dict() +if "all_concept_code_json" not in st.session_state: + st.session_state["all_concept_code_json"] = dict() # get all the few_shot LLM output examples present locally def get_all_few_shot(example_languages, test_concept, language): for language in example_languages: programs = os.listdir(f"../data/few_shot_outputs/uast_{test_concept}/{language}") - names = [os.path.basename(file).split('.')[0] for file in programs] + names = [os.path.basename(file).split(".")[0] for file in programs] for i in range(len(programs)): - if (language not in st.session_state['all_few_shot_outputs']): - st.session_state['all_few_shot_outputs'][language] = dict() + if language not in st.session_state["all_few_shot_outputs"]: + st.session_state["all_few_shot_outputs"][language] = dict() + + content = open( + f"../data/few_shot_outputs/uast_{test_concept}/{language}/{programs[i]}", + "r", + ).read() + st.session_state["all_few_shot_outputs"][language][names[i]] = content - content = open(f"../data/few_shot_outputs/uast_{test_concept}/{language}/{programs[i]}", "r").read() - st.session_state['all_few_shot_outputs'][language][names[i]] = content """ get all the few_shot code examples present locally and their corresponding AST with given max depth. This function also calls the AST preprocessor to store it in a global dictionary to retrieve in one step. """ -def get_all_asts_code(test_concept, max_depth = 0): - for language in st.session_state['Languages']: - parser.set_language(st.session_state['language_binding'][language]) + + +def get_all_asts_code(test_concept, max_depth=0): + for language in st.session_state["Languages"]: + parser.set_language(st.session_state["language_binding"][language]) # Define the directory path dir_path = f"../data/Concept_dataset/uast_{test_concept}/{language}" # Check if the directory exists @@ -281,193 +312,224 @@ def get_all_asts_code(test_concept, max_depth = 0): if not programs: print(f"No programs found for concept '{test_concept}' in language '{language}'. 
Skipping.") continue # Skip if the directory is empty - names = [os.path.basename(file).split('.')[0] for file in programs] - st.session_state['number_of_examples'][language] = len(programs) + names = [os.path.basename(file).split(".")[0] for file in programs] + st.session_state["number_of_examples"][language] = len(programs) for i in range(len(programs)): - if (language not in st.session_state['all_asts']): - st.session_state['all_asts'][language] = dict() - st.session_state['all_code_snippets'][language] = dict() - st.session_state['all_concept_code_json'][language] = dict() - - content = open(f"../data/Concept_dataset/uast_{test_concept}/{language}/{programs[i]}", "r").read() - st.session_state['all_code_snippets'][language][names[i]] = content + if language not in st.session_state["all_asts"]: + st.session_state["all_asts"][language] = dict() + st.session_state["all_code_snippets"][language] = dict() + st.session_state["all_concept_code_json"][language] = dict() + + content = open( + f"../data/Concept_dataset/uast_{test_concept}/{language}/{programs[i]}", + "r", + ).read() + st.session_state["all_code_snippets"][language][names[i]] = content ast = parser.parse(bytes(content, "utf8")) all_ast, all_code = None, None - if (GET_CONCEPTS_ONLY and (language != test_language)): + if GET_CONCEPTS_ONLY and (language != test_language): all_ast, all_code = get_concept_tree(ast, language) else: - all_ast, all_code = get_tree(ast, max_depth) - st.session_state['all_asts'][language][names[i]] = str(all_ast) - st.session_state['all_concept_code_json'][language][names[i]] = all_code + all_ast, all_code = get_tree(ast, max_depth) + st.session_state["all_asts"][language][names[i]] = str(all_ast) + st.session_state["all_concept_code_json"][language][names[i]] = all_code + """ get all the corresponding AST with given max depth of the given text-input. This function also calls the AST preprocessor to store it in a global dictionary to retrieve in one step. 
""" + + def get_text_test_example(language, test_code_snippet): - parser.set_language(st.session_state['language_binding'][language]) - if (language not in st.session_state['all_asts']): - st.session_state['all_asts'][language] = dict() - st.session_state['all_code_snippets'][language] = dict() - st.session_state['all_concept_code_json'][language] = dict() - st.session_state['all_code_snippets'][language]['0'] = test_code_snippet + parser.set_language(st.session_state["language_binding"][language]) + if language not in st.session_state["all_asts"]: + st.session_state["all_asts"][language] = dict() + st.session_state["all_code_snippets"][language] = dict() + st.session_state["all_concept_code_json"][language] = dict() + st.session_state["all_code_snippets"][language]["0"] = test_code_snippet ast = parser.parse(bytes(test_code_snippet, "utf8")) all_ast, all_code = get_tree(ast, max_depth) - st.session_state['all_asts'][language]['0'] = str(all_ast) - st.session_state['all_concept_code_json'][language]['0'] = all_code - + st.session_state["all_asts"][language]["0"] = str(all_ast) + st.session_state["all_concept_code_json"][language]["0"] = all_code + + # load the prompt for the concept category_prompt_file = f"../data/prompts/{test_concept}.txt" -st.session_state['prompt'] = open(category_prompt_file, "r").read() +st.session_state["prompt"] = open(category_prompt_file, "r").read() + def example_builder(lang, program_num): - if lang not in st.session_state['all_code_snippets']: + if lang not in st.session_state["all_code_snippets"]: print(f"No code snippets available for language '{lang}'. Skipping.") return None # Return None to indicate missing data - if str(program_num) not in st.session_state['all_code_snippets'][lang]: + if str(program_num) not in st.session_state["all_code_snippets"][lang]: print(f"No code snippet for program number '{program_num}' in language '{lang}'. Skipping.") return None - if lang not in st.session_state['all_asts'] or str(program_num) not in st.session_state['all_asts'][lang]: + if lang not in st.session_state["all_asts"] or str(program_num) not in st.session_state["all_asts"][lang]: print(f"No AST available for program number '{program_num}' in language '{lang}'. Skipping.") return None return f"\n{st.session_state['all_code_snippets'][lang][str(program_num)]}\n\n\n{st.session_state['all_asts'][lang][str(program_num)]}" + # get the fewshot examples in a pluggable form to the LLM. def get_few_shot(): few_shot_examples = [] for lang in example_languages: - for program_num in range(st.session_state['number_of_examples'][lang]): + for program_num in range(st.session_state["number_of_examples"][lang]): few_shot_examples.append( { - "input" : f"{example_builder(lang, program_num)}", - "output" : f"{st.session_state['all_few_shot_outputs'][lang][str(program_num)]}" + "input": f"{example_builder(lang, program_num)}", + "output": f"{st.session_state['all_few_shot_outputs'][lang][str(program_num)]}", } ) return few_shot_examples + # call funtions to get all such examples, codes and ASTs. 
get_all_asts_code(test_concept, max_depth)
get_all_few_shot(example_languages, test_concept, test_language)

st.markdown("### Enter prompt here")

# Make a modifiable prompt
-st.session_state['prompt'] = st.text_area("prompt", st.session_state['prompt'], height=700, label_visibility="collapsed")
+st.session_state["prompt"] = st.text_area(
+    "prompt", st.session_state["prompt"], height=700, label_visibility="collapsed"
+)

# If it's text-based, call the function to get the AST
if TEXT_TEST_CONCEPT:
    get_text_test_example(test_language, test_code_snippet)
-st.session_state['test_input'] = f"{example_builder(test_language, '0')}"
+st.session_state["test_input"] = f"{example_builder(test_language, '0')}"

# Display the few-shot examples JSON
-st.write('Training examples:')
+st.write("Training examples:")
st.write(get_few_shot())

# Display the test JSON
st.write("Test example:")
-st.write([st.session_state['test_input']])
+st.write([st.session_state["test_input"]])

"""
function to extract the rule from the response. This works because the few-shot examples align the LLM to generate its response in a fixed format.
"""
+
+
def get_rule_py(output_text):
-    content = output_text.split('```py', 1)[1].split('```', 1)[0].strip()
+    content = output_text.split("```py", 1)[1].split("```", 1)[0].strip()
    return content
+

"""
function to extract the node type from the response. This works because the few-shot examples align the LLM to generate its response in a fixed format.
"""
+
+
def extract_node_type(output_text):
-    content = output_text.split('see that the', 1)[1].split('nodes', 1)[0].strip()
-    return content.strip('\'"')
+    content = output_text.split("see that the", 1)[1].split("nodes", 1)[0].strip()
+    return content.strip("'\"")
+

"""
function to extract the IDs of all the relevant nodes from the response. Returns a list of relevant node IDs. This works because the few-shot examples align the LLM to generate its response in a fixed format.
"""
+
+
def extract_node_id(output_text):
    content = None
    try:
-        content = output_text.split('with ids = [', 1)[1].split(']', 1)[0].strip()
+        content = output_text.split("with ids = [", 1)[1].split("]", 1)[0].strip()
    except:
        try:
-            content = output_text.split('with id = ', 1)[1].split(',', 1)[0].strip()
+            content = output_text.split("with id = ", 1)[1].split(",", 1)[0].strip()
        except:
            st.write("can't be extracted")
-
-    if (',') not in content:
+
+    if (",") not in content:
        return [int(content)]
-
-    id_strings = content.split(',')
+
+    id_strings = content.split(",")
    return [int(id.strip()) for id in id_strings]
+

"""
function to save the output generated by the LLM.
"""
-def save_rule(language, node_type, rule, prompt, output, concept, ruleset_path, example_path, example_languages, test_code, max_depth):
+
+
+def save_rule(
+    language,
+    node_type,
+    rule,
+    prompt,
+    output,
+    concept,
+    ruleset_path,
+    example_path,
+    example_languages,
+    test_code,
+    max_depth,
+):
    ruleset_files = os.listdir(ruleset_path)
    print(ruleset_files)
    # if the file is already present then just add a new mapping from the relevant node type to its corresponding rule.
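As a sketch of what save_rule writes in the hunk below: each UAST_rules_<language>.json file maps a tree-sitter node type to a UAST node type plus an executable extractor snippet. The entry here mirrors the UAST_rules_py.json file added later in this diff; treat it as illustrative, not exhaustive.

# Illustrative shape of one ruleset entry (values taken from the Python ruleset below):
rule_dict = {
    "function_definition": {
        "uast_node_type": "uast_function",
        "extractor": "self.extracted = code_snippet.split('(', 1)[0].strip().split(' ')[-1].strip()",
    }
}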
-    if (f'UAST_rules_{language}.json' in ruleset_files):
-        rule_dict = json.load(open(f'{ruleset_path}/UAST_rules_{language}.json', 'r'))
-        rule_dict[node_type] = {
-            "uast_node_type": f"uast_{concept}",
-            "extractor": rule
-        }
+    if f"UAST_rules_{language}.json" in ruleset_files:
+        rule_dict = json.load(open(f"{ruleset_path}/UAST_rules_{language}.json", "r"))
+        rule_dict[node_type] = {"uast_node_type": f"uast_{concept}", "extractor": rule}
    # if it is not, then make a new dictionary with the same.
    else:
-        rule_dict = {
-            node_type : {
-                "uast_node_type": f"uast_{concept}",
-                "extractor": rule
-            }
-        }
+        rule_dict = {node_type: {"uast_node_type": f"uast_{concept}", "extractor": rule}}

-    print("saving rule for",language)
-    try:
+    print("saving rule for", language)
+    try:
        try:
            # try to save the rule dictionary
-            json.dump(rule_dict, open(f'{ruleset_path}/UAST_rules_{language}.json', 'w'), indent = 4)
+            json.dump(
+                rule_dict,
+                open(f"{ruleset_path}/UAST_rules_{language}.json", "w"),
+                indent=4,
+            )
            print("json saved")
        except Exception as e:
-            print("could not save rule JSON :", end = " ")
+            print("could not save rule JSON :", end=" ")
            print(e)

        # make the directory to save the output.
-        os.makedirs(example_path + '/' + concept + '/' + language, exist_ok= True)
+        os.makedirs(example_path + "/" + concept + "/" + language, exist_ok=True)
        files_present = os.listdir(f"{example_path}/{concept}/{language}")
-
+
        # check the files that are already present, since one concept can have multiple relevant nodes.
        counter = 0
-        while(f"{counter}.txt" in files_present):
+        while f"{counter}.txt" in files_present:
            counter += 1

        # saving the LLM output, input code, few-shot languages and the prompt.
        with open(f"{example_path}/{concept}/{language}/{counter}.txt", "w") as f:
            f.write(output)
-
+
        with open(f"{example_path}/{concept}/{language}/prompt_{counter}.txt", "w") as f:
            f.write(prompt)
-
+
        with open(f"{example_path}/{concept}/{language}/example_languages_{counter}.txt", "w") as f:
-            f.write(str(example_languages) + '\n' + 'max_depth = '+ str(max_depth))
-
+            f.write(str(example_languages) + "\n" + "max_depth = " + str(max_depth))
+
        with open(f"{example_path}/{concept}/{language}/test_code_{counter}.txt", "w") as f:
            f.write(test_code)
-
-        os.makedirs(f"../data/few_shot_outputs/uast_{concept}/{language}", exist_ok= True)
-        os.makedirs(f"../data/Concept_dataset/uast_{concept}/{language}", exist_ok= True)
+
+        os.makedirs(f"../data/few_shot_outputs/uast_{concept}/{language}", exist_ok=True)
+        os.makedirs(f"../data/Concept_dataset/uast_{concept}/{language}", exist_ok=True)

        # save the output as another few-shot example.
        with open(f"../data/few_shot_outputs/uast_{concept}/{language}/{counter}.txt", "w") as f:
            f.write(output)
-
+
        with open(f"../data/Concept_dataset/uast_{concept}/{language}/{counter}.txt", "w") as f:
            f.write(test_code)
-
+
        # if everything is successful, display balloons on the screen!
        st.balloons()
        print("Voila! prompt worked!")
@@ -475,53 +537,64 @@ def save_rule(language, node_type, rule, prompt, output, concept, ruleset_path,
        print("COULD NOT SAVE FOR", language, "because :", e)

    # add concept nodes in the cached_requirements and save it.
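The hunk that follows appends the new node type to cached_requirements["concept_to_node_map"]. The nesting is language -> concept -> list of node types; a small sketch with invented entries makes that concrete:

# Invented entries, shown only to illustrate the nesting updated below:
cached_requirements = {
    "concept_to_node_map": {
        "py": {"function": ["function_definition"], "comment": ["comment"]},
        "java": {"function": ["method_declaration"]},
    }
}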
-    if (concept in st.session_state['cached_requirements']['concept_to_node_map'][language]) :
-        if (node_type not in st.session_state['cached_requirements']['concept_to_node_map'][language][concept]):
-            st.session_state['cached_requirements']['concept_to_node_map'][language][concept].append(node_type)
-    else :
-        st.session_state['cached_requirements']['concept_to_node_map'][language][concept] = [node_type]
-
+    if concept in st.session_state["cached_requirements"]["concept_to_node_map"][language]:
+        if node_type not in st.session_state["cached_requirements"]["concept_to_node_map"][language][concept]:
+            st.session_state["cached_requirements"]["concept_to_node_map"][language][concept].append(node_type)
+    else:
+        st.session_state["cached_requirements"]["concept_to_node_map"][language][concept] = [node_type]
+
+    concept_to_node_map = st.session_state["cached_requirements"]["concept_to_node_map"]
+    json.dump(
+        st.session_state["cached_requirements"],
+        open("cached_requirements.json", "w"),
+        indent=4,
+    )
-    concept_to_node_map = st.session_state['cached_requirements']['concept_to_node_map']
-    json.dump(st.session_state['cached_requirements'], open("cached_requirements.json", "w"), indent= 4)


# remove full-line comments from the code that the LLM generates. This is done to reduce memory consumption, as the output is already saved for documentation purposes.
def remove_comments(text):
-    return re.sub(r"^(#.*?$)\n", "", text, flags = re.MULTILINE)
+    return re.sub(r"^(#.*?$)\n", "", text, flags=re.MULTILINE)
+

# change the 'extracted' keyword to 'self.extracted' so the rule works inside the parser.
def process_rule(text):
    return remove_comments(text).replace("extracted", "self.extracted")
+

# function to enable stream generation by yielding tokens.
response = None
+
+
def stream_data():
    for token in response:
        yield token.results[0].generated_text
+
def build_prompt():
-    prompt = st.session_state['prompt'] + "\n\n"
+    prompt = st.session_state["prompt"] + "\n\n"
    examples = get_few_shot()
    for example in examples:
        prompt += "Input:\n"
-        prompt += example['input'] + "\n"
+        prompt += example["input"] + "\n"
        prompt += "Output:\n"
-        prompt += example['output'] + "\n\n"
+        prompt += example["output"] + "\n\n"
    prompt += "Input:\n"
-    prompt += st.session_state['test_input'] + "\n"
+    prompt += st.session_state["test_input"] + "\n"
    prompt += "Output:\n"
    # The model is expected to generate the output here
    return prompt
+
+
# If the submit button is clicked, perform the subsequent operations
-if st.sidebar.button('Submit'):
+if st.sidebar.button("Submit"):
    # Build the prompt
    prompt_text = build_prompt()
    # Invoke the query to the LLM
-    with st.spinner('Language model is working ...'):
+    with st.spinner("Language model is working ..."):
        if LLM_PROVIDER == "IBM-Watsonx.ai":
-            response = st.session_state['client'].generate_text(prompt_text)
+            response = st.session_state["client"].generate_text(prompt_text)
            ans = response  # IBM Watsonx.ai returns the generated text directly
        elif LLM_PROVIDER == "OpenAI":
            try:
@@ -530,16 +603,16 @@ def build_prompt():
                    engine="text-davinci-003",
                    prompt=prompt_text,
                    max_tokens=1024,
-                    temperature=0
+                    temperature=0,
                )
-                ans = openai_response['choices'][0]['text']
+                ans = openai_response["choices"][0]["text"]
            except Exception as e:
                st.error(f"An error occurred with OpenAI: {e}")
                st.stop()

-    st.markdown('### Response:')
+    st.markdown("### Response:")
    st.write(ans)
-    st.write('----------------------------------------------')
+    st.write("----------------------------------------------")
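Given build_prompt above, the complete prompt sent to the model has the following layout; the angle-bracketed parts are placeholders for real content, and each few-shot "input" is the code snippet followed by its serialized AST as assembled by example_builder:

    <editable concept prompt>

    Input:
    <few-shot code snippet + its AST>
    Output:
    <expected answer for that example>
    ... (one Input/Output pair per few-shot example)
    Input:
    <test code snippet + its AST>
    Output:

The model continues after the final "Output:", and the fixed phrases in that continuation ("see that the ... nodes", "with ids = [...]", a ```py block) are exactly what get_rule_py, extract_node_type, and extract_node_id parse.
#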
Extract the nodes and IDs nodes = extract_node_id(ans) @@ -549,8 +622,7 @@ def build_prompt(): # Get the relevant code snippets from the IDs it extracted code_snippets = [ - st.session_state['all_concept_code_json'][test_language][str(test_file_num)][node] - for node in nodes + st.session_state["all_concept_code_json"][test_language][str(test_file_num)][node] for node in nodes ] extracted = None @@ -558,10 +630,10 @@ def build_prompt(): for i in range(len(code_snippets)): code_snippet = code_snippets[i] exec(rule) - st.write(f'For Node with ID = {nodes[i]} and code') - st.write(f'```{test_language}\n{code_snippet}') - annotated_text('The extracted part is', (extracted, '', 'rgba(10,50,170,0.5)')) - st.write('----------------------------------------------') + st.write(f"For Node with ID = {nodes[i]} and code") + st.write(f"```{test_language}\n{code_snippet}") + annotated_text("The extracted part is", (extracted, "", "rgba(10,50,170,0.5)")) + st.write("----------------------------------------------") # One-click acceptance of rule st.sidebar.button( @@ -571,13 +643,13 @@ def build_prompt(): test_language, extract_node_type(ans), process_rule(rule), - st.session_state['prompt'], + st.session_state["prompt"], ans, test_concept, "../ruleset", "../data/final_UI_outputs", example_languages, - st.session_state['all_code_snippets'][test_language]['0'], - max_depth - ] - ) \ No newline at end of file + st.session_state["all_code_snippets"][test_language]["0"], + max_depth, + ], + ) diff --git a/transforms/code/code_profiler/python/src/profiler-report/template.html b/transforms/code/code_profiler/dpk_code_profiler/profiler-report/template.html similarity index 100% rename from transforms/code/code_profiler/python/src/profiler-report/template.html rename to transforms/code/code_profiler/dpk_code_profiler/profiler-report/template.html diff --git a/transforms/code/code_profiler/python/src/profiler_report.py b/transforms/code/code_profiler/dpk_code_profiler/profiler_report.py similarity index 74% rename from transforms/code/code_profiler/python/src/profiler_report.py rename to transforms/code/code_profiler/dpk_code_profiler/profiler_report.py index 75dae8320..6bbfa0f87 100644 --- a/transforms/code/code_profiler/python/src/profiler_report.py +++ b/transforms/code/code_profiler/dpk_code_profiler/profiler_report.py @@ -10,26 +10,29 @@ # limitations under the License. ################################################################################ +import json import os -import numpy as np +import socket from collections import Counter -from jinja2 import Environment, FileSystemLoader -import pyarrow as pa +from datetime import datetime +from pathlib import Path + +import numpy as np import plotly.graph_objects as go +import pyarrow as pa +from jinja2 import Environment, FileSystemLoader from plotly.io import to_html -from pathlib import Path -import json -import socket -from datetime import datetime -base_constructs = ['UAST_Package_List', 'language', 'Concepts'] +base_constructs = ["UAST_Package_List", "language", "Concepts"] + class Plot: - ''' + """ Plot class implements the generation of frequency distribution plots of the various components of the profiler report. Given a pyarrow table and a column name, it generates the corresponding plot. 
- ''' + """ + def __init__(self, table, column_name): self.table = table self.column_name = column_name @@ -39,8 +42,8 @@ def _get_column_data(self): column_data = self.table[self.column_name].to_numpy() split_data = [] for value in column_data: - if isinstance(value, str) and ',' in value: - split_data.extend(value.split(',')) + if isinstance(value, str) and "," in value: + split_data.extend(value.split(",")) else: split_data.append(value) return np.array([item.strip() if isinstance(item, str) else item for item in split_data]) @@ -49,61 +52,60 @@ def generate_distribution_plot(self): data = self.column_data fig = go.Figure() cleaned_data = [item for item in data if item is not None] - fig.add_trace(go.Histogram(x=cleaned_data, nbinsx=len(np.unique(cleaned_data)), opacity=0.7, marker=dict(color='blue', line=dict(width=1, color='black')))) + fig.add_trace( + go.Histogram( + x=cleaned_data, + nbinsx=len(np.unique(cleaned_data)), + opacity=0.7, + marker=dict(color="blue", line=dict(width=1, color="black")), + ) + ) # fig.add_trace(go.Histogram(x=data, nbinsx=len(np.unique(data)), opacity=0.7, marker=dict(color='blue', line=dict(width=1, color='black')))) fig.update_layout( - width=500, - height=300, - title=dict( - text=f'Distribution of {self.column_name}', - font=dict(size=14) - ), - xaxis=dict( - title='Value', - title_font=dict(size=12), - tickfont=dict(size=10) - ), - yaxis=dict( - title='Frequency', - title_font=dict(size=12), - tickfont=dict(size=10) - ), - bargap=0.1 + width=500, + height=300, + title=dict(text=f"Distribution of {self.column_name}", font=dict(size=14)), + xaxis=dict(title="Value", title_font=dict(size=12), tickfont=dict(size=10)), + yaxis=dict(title="Frequency", title_font=dict(size=12), tickfont=dict(size=10)), + bargap=0.1, ) return to_html(fig, full_html=False) class Report: - ''' + """ Generates the report containing the distribution of various syntactic and semantic components. - ''' + """ + def __init__(self, template_file: str): path = Path(template_file) - directory = path.parent - file_name = path.name + directory = path.parent + file_name = path.name self.env = Environment(loader=FileSystemLoader(directory)) self.template = self.env.get_template(file_name) self.data = {} - self.data['title'] = 'Profiler Report' - self.data['heading'] = 'Syntactic and Semantic Profile' - self.data['description'] = 'This report presents the detailed profiling report of the input dataset.' + self.data["title"] = "Profiler Report" + self.data["heading"] = "Syntactic and Semantic Profile" + self.data["description"] = "This report presents the detailed profiling report of the input dataset." 
def add_metric(self, metric_id, name, value_counts, graph_html=None): - if 'metrics' not in self.data: - self.data['metrics'] = [] - self.data['metrics'].append({ - 'id': metric_id, - 'name': name, - 'graph_html': graph_html, - 'value_counts': value_counts, - }) + if "metrics" not in self.data: + self.data["metrics"] = [] + self.data["metrics"].append( + { + "id": metric_id, + "name": name, + "graph_html": graph_html, + "value_counts": value_counts, + } + ) def render(self): return self.template.render(self.data) def save(self, output_file): output = self.render() - with open(output_file, 'w') as f: + with open(output_file, "w") as f: f.write(output) print(f"HTML file generated: {output_file}") @@ -144,23 +146,24 @@ def recursive_sort_by_value_counts(self, obj): def save_as_json(self, output_file): # Convert output_file to a Path object output_path = Path(output_file) - + # Check if the file already exists and get a unique filename if needed unique_output_path = self.get_unique_filename(output_path) - + # Remove non-serializable data, like HTML, if necessary serializable_data = self.data.copy() - for metric in serializable_data.get('metrics', []): - if 'graph_html' in metric: - del metric['graph_html'] # Remove or replace with raw data if necessary + for metric in serializable_data.get("metrics", []): + if "graph_html" in metric: + del metric["graph_html"] # Remove or replace with raw data if necessary sorted_data = self.recursive_sort_by_value_counts(serializable_data) # Save the report data as JSON - with open(unique_output_path, 'w') as json_file: + with open(unique_output_path, "w") as json_file: json.dump(sorted_data, json_file, indent=4) print(f"Report data saved as JSON: {unique_output_path}") + def generate_report(table: pa.Table, metrics_list): """ Generates the profiler report given the table name and the metrics list given as input by the user. 
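Between these hunks, a short sketch of how one table column becomes the value_counts dict passed to add_metric: comma-joined cells are split apart (mirroring Plot._get_column_data above) and then tallied with Counter (as in the generate_report hunk below). The sample data is invented:

from collections import Counter

# Invented sample: one cell can hold several comma-joined values.
column = ["numpy,pandas", "numpy", "flask"]
flat = []
for value in column:
    if "," in value:
        flat.extend(part.strip() for part in value.split(","))
    else:
        flat.append(value)
print(dict(Counter(flat)))  # {'numpy': 2, 'pandas': 1, 'flask': 1}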
@@ -172,11 +175,11 @@ def generate_report(table: pa.Table, metrics_list): host_name = socket.gethostname() timestamp = datetime.now().strftime("%Y%m%d-%H%M%S-%f")[:-3] # Truncate to milliseconds unique_suffix = f"{host_name}_{timestamp}" - + # Define file names with the unique suffix - template_file = str(script_dir / 'template.html') - output_html = str(script_dir / f'output_{unique_suffix}.html') - output_json = str(script_dir / f'output_{unique_suffix}.json') + template_file = str(script_dir / "template.html") + output_html = str(script_dir / f"output_{unique_suffix}.html") + output_json = str(script_dir / f"output_{unique_suffix}.json") report = Report(template_file) id = 0 @@ -187,6 +190,6 @@ def generate_report(table: pa.Table, metrics_list): filtered_column_data = [item for item in plot.column_data if item != ""] value_counts = dict(Counter(filtered_column_data)) report.add_metric(id, column_name, value_counts, plot_html) - id+=1 - report.save(output_html) - report.save_as_json(output_json) \ No newline at end of file + id += 1 + report.save(output_html) + report.save_as_json(output_json) diff --git a/transforms/code/code_profiler/ray/src/code_profiler_local_ray.py b/transforms/code/code_profiler/dpk_code_profiler/ray/local_ray.py similarity index 98% rename from transforms/code/code_profiler/ray/src/code_profiler_local_ray.py rename to transforms/code/code_profiler/dpk_code_profiler/ray/local_ray.py index 93983ec66..c10c40911 100644 --- a/transforms/code/code_profiler/ray/src/code_profiler_local_ray.py +++ b/transforms/code/code_profiler/dpk_code_profiler/ray/local_ray.py @@ -12,9 +12,11 @@ import os import sys + +from code_profiler_transform_ray import CodeProfilerRayTransformConfiguration from data_processing.utils import ParamsUtils from data_processing_ray.runtime.ray import RayTransformLauncher -from code_profiler_transform_ray import CodeProfilerRayTransformConfiguration + # create parameters input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../", "../", "input")) @@ -25,7 +27,7 @@ "input_folder": input_folder, "output_folder": output_folder, "contents": "contents", - "language": "language" + "language": "language", } worker_options = {"num_cpus": 1} # Code location and parameters @@ -41,7 +43,7 @@ "runtime_pipeline_id": "pipeline_id", "runtime_job_id": "job_id", "runtime_creation_delay": 0, - "runtime_code_location": ParamsUtils.convert_to_ast(code_location) + "runtime_code_location": ParamsUtils.convert_to_ast(code_location), } diff --git a/transforms/code/code_profiler/ray/src/code_profiler_s3_ray.py b/transforms/code/code_profiler/dpk_code_profiler/ray/s3_ray.py similarity index 99% rename from transforms/code/code_profiler/ray/src/code_profiler_s3_ray.py rename to transforms/code/code_profiler/dpk_code_profiler/ray/s3_ray.py index a603dbe16..18c89684b 100644 --- a/transforms/code/code_profiler/ray/src/code_profiler_s3_ray.py +++ b/transforms/code/code_profiler/dpk_code_profiler/ray/s3_ray.py @@ -12,10 +12,12 @@ import os import sys + from code_profiler_transform_ray import CodeProfilerRayTransformConfiguration from data_processing.utils import ParamsUtils from data_processing_ray.runtime.ray import RayTransformLauncher + # create parameters s3_cred = { "access_key": "localminioaccesskey", diff --git a/transforms/code/code_profiler/ray/src/semantic-ruleset/ikb_model.csv b/transforms/code/code_profiler/dpk_code_profiler/ray/semantic-ruleset/ikb_model.csv similarity index 100% rename from 
transforms/code/code_profiler/ray/src/semantic-ruleset/ikb_model.csv rename to transforms/code/code_profiler/dpk_code_profiler/ray/semantic-ruleset/ikb_model.csv diff --git a/transforms/code/code_profiler/ray/src/semantic-ruleset/null_libs.csv b/transforms/code/code_profiler/dpk_code_profiler/ray/semantic-ruleset/null_libs.csv similarity index 100% rename from transforms/code/code_profiler/ray/src/semantic-ruleset/null_libs.csv rename to transforms/code/code_profiler/dpk_code_profiler/ray/semantic-ruleset/null_libs.csv diff --git a/transforms/code/code_profiler/python/src/semantic-ruleset/offline-ikb-builder/concept_list.csv b/transforms/code/code_profiler/dpk_code_profiler/ray/semantic-ruleset/offline-ikb-builder/concept_list.csv similarity index 100% rename from transforms/code/code_profiler/python/src/semantic-ruleset/offline-ikb-builder/concept_list.csv rename to transforms/code/code_profiler/dpk_code_profiler/ray/semantic-ruleset/offline-ikb-builder/concept_list.csv diff --git a/transforms/code/code_profiler/python/src/semantic-ruleset/offline-ikb-builder/examples/examples-i.csv b/transforms/code/code_profiler/dpk_code_profiler/ray/semantic-ruleset/offline-ikb-builder/examples/examples-i.csv similarity index 100% rename from transforms/code/code_profiler/python/src/semantic-ruleset/offline-ikb-builder/examples/examples-i.csv rename to transforms/code/code_profiler/dpk_code_profiler/ray/semantic-ruleset/offline-ikb-builder/examples/examples-i.csv diff --git a/transforms/code/code_profiler/python/src/semantic-ruleset/offline-ikb-builder/examples/examples-o.csv b/transforms/code/code_profiler/dpk_code_profiler/ray/semantic-ruleset/offline-ikb-builder/examples/examples-o.csv similarity index 100% rename from transforms/code/code_profiler/python/src/semantic-ruleset/offline-ikb-builder/examples/examples-o.csv rename to transforms/code/code_profiler/dpk_code_profiler/ray/semantic-ruleset/offline-ikb-builder/examples/examples-o.csv diff --git a/transforms/code/code_profiler/ray/src/semantic-ruleset/offline-ikb-builder/generate_ikb.py b/transforms/code/code_profiler/dpk_code_profiler/ray/semantic-ruleset/offline-ikb-builder/generate_ikb.py similarity index 57% rename from transforms/code/code_profiler/ray/src/semantic-ruleset/offline-ikb-builder/generate_ikb.py rename to transforms/code/code_profiler/dpk_code_profiler/ray/semantic-ruleset/offline-ikb-builder/generate_ikb.py index a41a4e9a9..2c4e8fed9 100644 --- a/transforms/code/code_profiler/ray/src/semantic-ruleset/offline-ikb-builder/generate_ikb.py +++ b/transforms/code/code_profiler/dpk_code_profiler/ray/semantic-ruleset/offline-ikb-builder/generate_ikb.py @@ -10,12 +10,13 @@ # limitations under the License. 
################################################################################ -import os import argparse import csv +import os +from io import BytesIO, StringIO + import pyarrow as pa import pyarrow.csv as pv -from io import StringIO,BytesIO from watsonxai import generateResponseWatsonx @@ -23,25 +24,24 @@ def getStringFromCSV(file): table = pv.read_csv(file) csv_buffer = StringIO() column_names = table.column_names - csv_buffer.write(','.join(column_names) + '\n') + csv_buffer.write(",".join(column_names) + "\n") for row in range(table.num_rows): row_data = [str(table[column][row].as_py()) for column in column_names] - csv_buffer.write(','.join(row_data) + '\n') + csv_buffer.write(",".join(row_data) + "\n") return csv_buffer.getvalue() - def gen_combined_strings(file_data): - file_data = file_data.splitlines() + file_data = file_data.splitlines() headers = file_data[0] null_libraries = file_data[1:] combined_strings = [] combined_string = "" for idx, entry in enumerate(null_libraries, start=1): - if combined_string == "": + if combined_string == "": combined_string += f"{headers.strip()}\n" combined_string += f"{entry}\n" - if idx % 30 == 0 or idx == len(null_libraries): + if idx % 30 == 0 or idx == len(null_libraries): combined_strings.append(combined_string) combined_string = "" return combined_strings @@ -49,19 +49,57 @@ def gen_combined_strings(file_data): if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Generate IKB.') - parser.add_argument('--null_libs_file', type=str, help='Path to null libraries file.', default=os.getenv('NULL_LIBS_FILE', '../ikb/null_libs.csv')) - parser.add_argument('--cmap_file', type=str, help='Path to concept map file.', default=os.getenv('CMAP_FILE', '../concept_map/updated_concept_list.csv')) - parser.add_argument('--input_examples_file', type=str, help='Path to input examples file.', default=os.getenv('EXAMPLES_I_FILE', '../examples/examples-i.csv')) - parser.add_argument('--output_examples_file', type=str, help='Path to output examples file.', default=os.getenv('EXAMPLES_O_FILE', '../examples/examples-o.csv')) - parser.add_argument('--extracted_data_file', type=str, help='Path to file in which LLM output will be stored.', default=os.getenv('EXTRACTED_DATA_FILE', '../ikb/extracted_data.csv')) - parser.add_argument('--api_type', type=str, help='API Type', default=os.getenv('API_TYPE', 'WatsonxAI')) - parser.add_argument('--api_key', type=str, help='API key', default=os.getenv('API_KEY', '')) - parser.add_argument('--api_endpoint', type=str, help='API endpoint', default=os.getenv('API_ENDPOINT', 'https://us-south.ml.cloud.ibm.com')) - parser.add_argument('--project_id', type=str, help='Project ID', default=os.getenv('PROJECT_ID', '')) - parser.add_argument('--model_id', type=str, help='LLM model ID', default=os.getenv('MODEL_ID', 'meta-llama/llama-3-70b-instruct')) - - + parser = argparse.ArgumentParser(description="Generate IKB.") + parser.add_argument( + "--null_libs_file", + type=str, + help="Path to null libraries file.", + default=os.getenv("NULL_LIBS_FILE", "../ikb/null_libs.csv"), + ) + parser.add_argument( + "--cmap_file", + type=str, + help="Path to concept map file.", + default=os.getenv("CMAP_FILE", "../concept_map/updated_concept_list.csv"), + ) + parser.add_argument( + "--input_examples_file", + type=str, + help="Path to input examples file.", + default=os.getenv("EXAMPLES_I_FILE", "../examples/examples-i.csv"), + ) + parser.add_argument( + "--output_examples_file", + type=str, + help="Path to output examples file.", 
+ default=os.getenv("EXAMPLES_O_FILE", "../examples/examples-o.csv"), + ) + parser.add_argument( + "--extracted_data_file", + type=str, + help="Path to file in which LLM output will be stored.", + default=os.getenv("EXTRACTED_DATA_FILE", "../ikb/extracted_data.csv"), + ) + parser.add_argument( + "--api_type", + type=str, + help="API Type", + default=os.getenv("API_TYPE", "WatsonxAI"), + ) + parser.add_argument("--api_key", type=str, help="API key", default=os.getenv("API_KEY", "")) + parser.add_argument( + "--api_endpoint", + type=str, + help="API endpoint", + default=os.getenv("API_ENDPOINT", "https://us-south.ml.cloud.ibm.com"), + ) + parser.add_argument("--project_id", type=str, help="Project ID", default=os.getenv("PROJECT_ID", "")) + parser.add_argument( + "--model_id", + type=str, + help="LLM model ID", + default=os.getenv("MODEL_ID", "meta-llama/llama-3-70b-instruct"), + ) args = parser.parse_args() concepts = getStringFromCSV(args.cmap_file) @@ -69,12 +107,15 @@ def gen_combined_strings(file_data): output_examples = getStringFromCSV(args.output_examples_file) null_libs_file_data = getStringFromCSV(args.null_libs_file) - combined_strings = gen_combined_strings(null_libs_file_data) + combined_strings = gen_combined_strings(null_libs_file_data) endtoken = "" prompt_name = "My-prompt" - prompt_template = '''You are responsible for classifying programming language packages based on their functionality into one of the following STRICT categories: - ''' + concepts + ''' + prompt_template = ( + """You are responsible for classifying programming language packages based on their functionality into one of the following STRICT categories: + """ + + concepts + + """ Instructions: @@ -100,36 +141,38 @@ def gen_combined_strings(file_data): Examples: INPUT: - ''' + str(input_examples) + "OUTPUT:\n" + str(output_examples).strip("\n")+"\n" + """ + + str(input_examples) + + "OUTPUT:\n" + + str(output_examples).strip("\n") + + "\n" + ) headers = ["Library", "Language", "Category"] file_exists = os.path.exists(args.extracted_data_file) if not file_exists: - with open(args.extracted_data_file, mode='w', newline='') as f: - csv_writer = csv.writer(f, quoting=csv.QUOTE_NONE, escapechar='\\') - csv_writer.writerow(headers) - + with open(args.extracted_data_file, mode="w", newline="") as f: + csv_writer = csv.writer(f, quoting=csv.QUOTE_NONE, escapechar="\\") + csv_writer.writerow(headers) for combined_string in combined_strings: input_template = prompt_template + f"\n\nINPUT: {combined_string} \nOUTPUT: " - if args.api_type == 'WatsonxAI': - response = generateResponseWatsonx(args.api_key, args.api_endpoint, args.model_id, args.project_id, input_template) - data = response.split(endtoken)[0] - csv_file = BytesIO(data.strip().encode('utf-8')) + if args.api_type == "WatsonxAI": + response = generateResponseWatsonx( + args.api_key, + args.api_endpoint, + args.model_id, + args.project_id, + input_template, + ) + data = response.split(endtoken)[0] + csv_file = BytesIO(data.strip().encode("utf-8")) csv_content = data.splitlines() not_first_row = 0 - with open(args.extracted_data_file, mode='a', newline='') as f: - csv_writer = csv.writer(f, quoting=csv.QUOTE_NONE, escapechar='\\') + with open(args.extracted_data_file, mode="a", newline="") as f: + csv_writer = csv.writer(f, quoting=csv.QUOTE_NONE, escapechar="\\") for line in csv_content: if not_first_row: - row = line.split(',') + row = line.split(",") csv_writer.writerow(row) not_first_row = 1 - - - - - - - - diff --git 
a/transforms/code/code_profiler/ray/src/semantic-ruleset/offline-ikb-builder/watsonxai.py b/transforms/code/code_profiler/dpk_code_profiler/ray/semantic-ruleset/offline-ikb-builder/watsonxai.py similarity index 89% rename from transforms/code/code_profiler/ray/src/semantic-ruleset/offline-ikb-builder/watsonxai.py rename to transforms/code/code_profiler/dpk_code_profiler/ray/semantic-ruleset/offline-ikb-builder/watsonxai.py index d251af603..23086f1d3 100644 --- a/transforms/code/code_profiler/ray/src/semantic-ruleset/offline-ikb-builder/watsonxai.py +++ b/transforms/code/code_profiler/dpk_code_profiler/ray/semantic-ruleset/offline-ikb-builder/watsonxai.py @@ -10,25 +10,23 @@ # limitations under the License. ################################################################################ -from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams -from ibm_watsonx_ai.foundation_models import ModelInference from ibm_watsonx_ai import Credentials +from ibm_watsonx_ai.foundation_models import ModelInference +from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams + def generateResponseWatsonx(api_key, api_endpoint, model_id, project_id, input_template): credentials = Credentials(api_key=api_key, url=api_endpoint) parameters = { GenParams.DECODING_METHOD: "greedy", GenParams.MAX_NEW_TOKENS: 100, - GenParams.STOP_SEQUENCES: [""] + GenParams.STOP_SEQUENCES: [""], } model = ModelInference( - model_id=model_id, - params=parameters, + model_id=model_id, + params=parameters, credentials=credentials, - project_id=project_id) + project_id=project_id, + ) response = model.generate_text(input_template) return response - - - - diff --git a/transforms/code/code_profiler/ray/src/code_profiler_transform_ray.py b/transforms/code/code_profiler/dpk_code_profiler/ray/transform_ray.py similarity index 99% rename from transforms/code/code_profiler/ray/src/code_profiler_transform_ray.py rename to transforms/code/code_profiler/dpk_code_profiler/ray/transform_ray.py index 9d1c62234..509c1b044 100644 --- a/transforms/code/code_profiler/ray/src/code_profiler_transform_ray.py +++ b/transforms/code/code_profiler/dpk_code_profiler/ray/transform_ray.py @@ -16,11 +16,12 @@ CodeProfilerTransform, CodeProfilerTransformConfiguration, ) +from data_processing.utils import get_logger from data_processing_ray.runtime.ray import RayTransformLauncher from data_processing_ray.runtime.ray.runtime_configuration import ( RayTransformRuntimeConfiguration, ) -from data_processing.utils import get_logger + logger = get_logger(__name__) @@ -39,6 +40,7 @@ def __init__(self): """ super().__init__(transform_config=CodeProfilerTransformConfiguration(transform_class=CodeProfilerTransform)) + if __name__ == "__main__": print("In code_profiler_transform_ray") diff --git a/transforms/code/code_profiler/python/src/ruleset/UAST_rules_agda.json b/transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_agda.json similarity index 99% rename from transforms/code/code_profiler/python/src/ruleset/UAST_rules_agda.json rename to transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_agda.json index 139f4ec5b..3b2d9dc7a 100644 --- a/transforms/code/code_profiler/python/src/ruleset/UAST_rules_agda.json +++ b/transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_agda.json @@ -11,4 +11,4 @@ "uast_node_type": "uast_comment", "extractor": "if code_snippet.startswith('{-'):\n # As the snippets start with '{-' and end with '-}', we can remove two characters from both ends to get the required snippet\n 
self.extracted = code_snippet[2:-2].strip()\nelif code_snippet.startswith('--'):\n # As the snippets start with '--', we can remove two characters from the start to get the required snippet\n self.extracted = code_snippet[2:].strip()" } -} \ No newline at end of file +} diff --git a/transforms/code/code_profiler/python/src/ruleset/UAST_rules_c.json b/transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_c.json similarity index 99% rename from transforms/code/code_profiler/python/src/ruleset/UAST_rules_c.json rename to transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_c.json index 020c51597..a169bf37b 100644 --- a/transforms/code/code_profiler/python/src/ruleset/UAST_rules_c.json +++ b/transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_c.json @@ -11,4 +11,4 @@ "uast_node_type": "uast_comment", "extractor": "if (code_snippet[0:2] == '//'):\n self.extracted = code_snippet[2:].strip()\nelse:\n self.extracted = code_snippet[2:-2].strip()" } -} \ No newline at end of file +} diff --git a/transforms/code/code_profiler/python/src/ruleset/UAST_rules_c_sharp.json b/transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_c_sharp.json similarity index 99% rename from transforms/code/code_profiler/python/src/ruleset/UAST_rules_c_sharp.json rename to transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_c_sharp.json index f0f32de95..65e3a88ef 100644 --- a/transforms/code/code_profiler/python/src/ruleset/UAST_rules_c_sharp.json +++ b/transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_c_sharp.json @@ -11,4 +11,4 @@ "uast_node_type": "uast_function", "extractor": "temp_0 = code_snippet.split('(')[0].strip() \nself.extracted = temp_0.split(' ')[-1].strip()" } -} \ No newline at end of file +} diff --git a/transforms/code/code_profiler/python/src/ruleset/UAST_rules_cpp.json b/transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_cpp.json similarity index 99% rename from transforms/code/code_profiler/python/src/ruleset/UAST_rules_cpp.json rename to transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_cpp.json index 4197a0ef4..68a68bfc6 100644 --- a/transforms/code/code_profiler/python/src/ruleset/UAST_rules_cpp.json +++ b/transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_cpp.json @@ -19,4 +19,4 @@ "uast_node_type": "uast_call", "extractor": "self.extracted = code_snippet.split('(', 1)[0].strip()" } -} \ No newline at end of file +} diff --git a/transforms/code/code_profiler/python/src/ruleset/UAST_rules_d.json b/transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_d.json similarity index 99% rename from transforms/code/code_profiler/python/src/ruleset/UAST_rules_d.json rename to transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_d.json index 6f532973a..f5e8c9b63 100644 --- a/transforms/code/code_profiler/python/src/ruleset/UAST_rules_d.json +++ b/transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_d.json @@ -11,4 +11,4 @@ "uast_node_type": "uast_comment", "extractor": "if code_snippet.startswith('/*') and code_snippet.endswith('*/'):\n self.extracted = code_snippet[2:-2].strip()\nelif code_snippet.startswith('//'):\n self.extracted = code_snippet[2:].strip()" } -} \ No newline at end of file +} diff --git a/transforms/code/code_profiler/python/src/ruleset/UAST_rules_dart.json b/transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_dart.json similarity index 99% rename from 
transforms/code/code_profiler/python/src/ruleset/UAST_rules_dart.json rename to transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_dart.json index 80c5104c2..eef28f5d6 100644 --- a/transforms/code/code_profiler/python/src/ruleset/UAST_rules_dart.json +++ b/transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_dart.json @@ -11,4 +11,4 @@ "uast_node_type": "uast_comment", "extractor": "if (code_snippet[0:2] == '//'):\n self.extracted = code_snippet[2:].strip()\nelse:\n self.extracted = code_snippet[2:-2].strip()" } -} \ No newline at end of file +} diff --git a/transforms/code/code_profiler/python/src/ruleset/UAST_rules_elm.json b/transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_elm.json similarity index 99% rename from transforms/code/code_profiler/python/src/ruleset/UAST_rules_elm.json rename to transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_elm.json index e0da3babb..61962650a 100644 --- a/transforms/code/code_profiler/python/src/ruleset/UAST_rules_elm.json +++ b/transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_elm.json @@ -15,4 +15,4 @@ "uast_node_type": "uast_comment", "extractor": "self.extracted = code_snippet[2:-2].strip()" } -} \ No newline at end of file +} diff --git a/transforms/code/code_profiler/python/src/ruleset/UAST_rules_go.json b/transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_go.json similarity index 99% rename from transforms/code/code_profiler/python/src/ruleset/UAST_rules_go.json rename to transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_go.json index c72cfcec2..e983a0355 100644 --- a/transforms/code/code_profiler/python/src/ruleset/UAST_rules_go.json +++ b/transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_go.json @@ -7,4 +7,4 @@ "uast_node_type": "uast_comment", "extractor": "if (code_snippet[0:2] == '//'):\n self.extracted = code_snippet[2:].strip()\nelse:\n self.extracted = code_snippet[2:-2].strip()" } -} \ No newline at end of file +} diff --git a/transforms/code/code_profiler/python/src/ruleset/UAST_rules_haskell.json b/transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_haskell.json similarity index 99% rename from transforms/code/code_profiler/python/src/ruleset/UAST_rules_haskell.json rename to transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_haskell.json index 6bb09d9d1..9259832ac 100644 --- a/transforms/code/code_profiler/python/src/ruleset/UAST_rules_haskell.json +++ b/transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_haskell.json @@ -11,4 +11,4 @@ "uast_node_type": "uast_comment", "extractor": "self.extracted = code_snippet[2:].strip()" } -} \ No newline at end of file +} diff --git a/transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_java.json b/transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_java.json new file mode 100644 index 000000000..1f95fceea --- /dev/null +++ b/transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_java.json @@ -0,0 +1,22 @@ +{ + "import_declaration": { + "uast_node_type": "uast_package", + "extractor": "self.extracted = code_snippet.split('import', 1)[1].strip(' ;')" + }, + "line_comment": { + "uast_node_type": "uast_comment", + "extractor": "self.extracted = code_snippet[2:].strip()\n" + }, + "block_comment": { + "uast_node_type": "uast_comment", + "extractor": "self.extracted = code_snippet[2:-2].strip()\n" + }, + "method_declaration": { + "uast_node_type": "uast_function", + "extractor": 
"self.extracted = code_snippet.split('(', 1)[0].strip().split(' ')[-1].strip()" + }, + "method_invocation": { + "uast_node_type": "uast_call", + "extractor": "self.extracted = code_snippet.split('(', 1)[0].strip()" + } +} diff --git a/transforms/code/code_profiler/python/src/ruleset/UAST_rules_js.json b/transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_js.json similarity index 99% rename from transforms/code/code_profiler/python/src/ruleset/UAST_rules_js.json rename to transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_js.json index 6409d658b..7bcfd2d37 100644 --- a/transforms/code/code_profiler/python/src/ruleset/UAST_rules_js.json +++ b/transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_js.json @@ -7,4 +7,4 @@ "uast_node_type": "uast_package", "extractor": "text = code_snippet.split('import')[1].strip() \ntext = text.split('from')[1].strip()\ntext = text.replace('\"', '').replace('\\'', '').strip(' ;')\nself.extracted = text" } -} \ No newline at end of file +} diff --git a/transforms/code/code_profiler/python/src/ruleset/UAST_rules_kotlin.json b/transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_kotlin.json similarity index 99% rename from transforms/code/code_profiler/python/src/ruleset/UAST_rules_kotlin.json rename to transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_kotlin.json index c8ee1c54d..bfeb63fd7 100644 --- a/transforms/code/code_profiler/python/src/ruleset/UAST_rules_kotlin.json +++ b/transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_kotlin.json @@ -15,4 +15,4 @@ "uast_node_type": "uast_function", "extractor": "temp_0 = code_snippet.split('(')[0].strip() \nself.extracted = temp_0.split(' ')[-1].strip()" } -} \ No newline at end of file +} diff --git a/transforms/code/code_profiler/python/src/ruleset/UAST_rules_nim.json b/transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_nim.json similarity index 99% rename from transforms/code/code_profiler/python/src/ruleset/UAST_rules_nim.json rename to transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_nim.json index a5d090db7..f9d80e5e6 100644 --- a/transforms/code/code_profiler/python/src/ruleset/UAST_rules_nim.json +++ b/transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_nim.json @@ -23,4 +23,4 @@ "uast_node_type": "uast_function", "extractor": "temp_0 = code_snippet.split('proc')[1].split('(')[0].strip() \nself.extracted = temp_0" } -} \ No newline at end of file +} diff --git a/transforms/code/code_profiler/python/src/ruleset/UAST_rules_objc.json b/transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_objc.json similarity index 99% rename from transforms/code/code_profiler/python/src/ruleset/UAST_rules_objc.json rename to transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_objc.json index a3d1dd06a..5aa9fcc90 100644 --- a/transforms/code/code_profiler/python/src/ruleset/UAST_rules_objc.json +++ b/transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_objc.json @@ -15,4 +15,4 @@ "uast_node_type": "uast_comment", "extractor": "if (code_snippet[0:2] == '//'):\n self.extracted = code_snippet[2:].strip()\nelse:\n self.extracted = code_snippet[2:-2].strip()" } -} \ No newline at end of file +} diff --git a/transforms/code/code_profiler/python/src/ruleset/UAST_rules_ocaml.json b/transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_ocaml.json similarity index 99% rename from 
transforms/code/code_profiler/python/src/ruleset/UAST_rules_ocaml.json rename to transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_ocaml.json index 3ff506e04..4899e2613 100644 --- a/transforms/code/code_profiler/python/src/ruleset/UAST_rules_ocaml.json +++ b/transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_ocaml.json @@ -7,4 +7,4 @@ "uast_node_type": "uast_comment", "extractor": "self.extracted = code_snippet[3:-2].strip()" } -} \ No newline at end of file +} diff --git a/transforms/code/code_profiler/python/src/ruleset/UAST_rules_perl.json b/transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_perl.json similarity index 99% rename from transforms/code/code_profiler/python/src/ruleset/UAST_rules_perl.json rename to transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_perl.json index ac5185141..554073717 100644 --- a/transforms/code/code_profiler/python/src/ruleset/UAST_rules_perl.json +++ b/transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_perl.json @@ -7,4 +7,4 @@ "uast_node_type": "uast_function", "extractor": "temp_0 = code_snippet.split(' ')[1].strip() \nself.extracted = temp_0.split('{')[0].strip()" } -} \ No newline at end of file +} diff --git a/transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_py.json b/transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_py.json new file mode 100644 index 000000000..4abc4940c --- /dev/null +++ b/transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_py.json @@ -0,0 +1,26 @@ +{ + "import_statement": { + "uast_node_type": "uast_package", + "extractor": "text = code_snippet.split('import')[1].strip() \nif (',' in text):\n imports = text.split(',')\n all_imps = []\n for imp in imports:\n imp = imp.strip().split(' ')[0].strip()\n if ('.' in imp):\n imp = imp.split('.')[0]\n all_imps.append(imp)\n all_imps = list(set(all_imps))\n self.extracted = (', ').join(all_imps)\nelse:\n imp = text.strip().split(' ')[0].strip()\n if ('.' in imp):\n imp = imp.split('.')[0]\n self.extracted = imp\n" + }, + "import_from_statement": { + "uast_node_type": "uast_package", + "extractor": "text = code_snippet.split('from', 1)[1].strip()\ntext = text.split(' import')[0]\ntext = text.strip()\nif ('.' 
in text) :\n self.extracted = text.split('.')[0]\nelse:\n self.extracted = text\n" + }, + "comment": { + "uast_node_type": "uast_comment", + "extractor": "self.extracted = code_snippet[1:]" + }, + "function_definition": { + "uast_node_type": "uast_function", + "extractor": "self.extracted = code_snippet.split('(', 1)[0].strip().split(' ')[-1].strip()" + }, + "class_definition": { + "uast_node_type": "uast_class", + "extractor": "self.extracted = code_snippet.split('class ', 1)[1].split(':', 1)[0].strip()\nif ('(' in self.extracted):\n self.extracted = self.extracted.split('(', 1)[0].strip()" + }, + "call": { + "uast_node_type": "uast_call", + "extractor": "self.extracted = code_snippet.split('(', 1)[0].strip()" + } +} diff --git a/transforms/code/code_profiler/python/src/ruleset/UAST_rules_qmljs.json b/transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_qmljs.json similarity index 99% rename from transforms/code/code_profiler/python/src/ruleset/UAST_rules_qmljs.json rename to transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_qmljs.json index e9f2e5257..d982aa4c3 100644 --- a/transforms/code/code_profiler/python/src/ruleset/UAST_rules_qmljs.json +++ b/transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_qmljs.json @@ -7,4 +7,4 @@ "uast_node_type": "uast_comment", "extractor": "if (code_snippet[0:2] == '//'):\n self.extracted = code_snippet[2:].strip()\nelse:\n self.extracted = code_snippet[2:-2].strip()" } -} \ No newline at end of file +} diff --git a/transforms/code/code_profiler/python/src/ruleset/UAST_rules_rust.json b/transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_rust.json similarity index 99% rename from transforms/code/code_profiler/python/src/ruleset/UAST_rules_rust.json rename to transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_rust.json index 80c48156b..e47d154ee 100644 --- a/transforms/code/code_profiler/python/src/ruleset/UAST_rules_rust.json +++ b/transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_rust.json @@ -11,4 +11,4 @@ "uast_node_type": "uast_comment", "extractor": "self.extracted = code_snippet[2:].strip()" } -} \ No newline at end of file +} diff --git a/transforms/code/code_profiler/python/src/ruleset/UAST_rules_scala.json b/transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_scala.json similarity index 99% rename from transforms/code/code_profiler/python/src/ruleset/UAST_rules_scala.json rename to transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_scala.json index 2f96ffeb0..3d0b079a3 100644 --- a/transforms/code/code_profiler/python/src/ruleset/UAST_rules_scala.json +++ b/transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_scala.json @@ -15,4 +15,4 @@ "uast_node_type": "uast_function", "extractor": "temp_0 = code_snippet.split('def')[1].strip() \ntemp_1 = temp_0.split('(')[0].strip() \ntemp_2 = temp_1.split(':')[0].strip() \nself.extracted = temp_2" } -} \ No newline at end of file +} diff --git a/transforms/code/code_profiler/python/src/ruleset/UAST_rules_ts.json b/transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_ts.json similarity index 99% rename from transforms/code/code_profiler/python/src/ruleset/UAST_rules_ts.json rename to transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_ts.json index 064ecd819..3f28821eb 100644 --- a/transforms/code/code_profiler/python/src/ruleset/UAST_rules_ts.json +++ b/transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_ts.json @@ -11,4 +11,4 
@@ "uast_node_type": "uast_comment", "extractor": "if (code_snippet[0:2] == '//'):\n self.extracted = code_snippet[2:].strip()\nelse:\n self.extracted = code_snippet[2:-2].strip()" } -} \ No newline at end of file +} diff --git a/transforms/code/code_profiler/python/src/ruleset/UAST_rules_typescript.json b/transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_typescript.json similarity index 99% rename from transforms/code/code_profiler/python/src/ruleset/UAST_rules_typescript.json rename to transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_typescript.json index 064ecd819..3f28821eb 100644 --- a/transforms/code/code_profiler/python/src/ruleset/UAST_rules_typescript.json +++ b/transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_typescript.json @@ -11,4 +11,4 @@ "uast_node_type": "uast_comment", "extractor": "if (code_snippet[0:2] == '//'):\n self.extracted = code_snippet[2:].strip()\nelse:\n self.extracted = code_snippet[2:-2].strip()" } -} \ No newline at end of file +} diff --git a/transforms/code/code_profiler/python/src/ruleset/UAST_rules_verilog.json b/transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_verilog.json similarity index 99% rename from transforms/code/code_profiler/python/src/ruleset/UAST_rules_verilog.json rename to transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_verilog.json index e8eb728e1..f5f65bec7 100644 --- a/transforms/code/code_profiler/python/src/ruleset/UAST_rules_verilog.json +++ b/transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_verilog.json @@ -15,4 +15,4 @@ "uast_node_type": "uast_function", "extractor": "self.extracted = code_snippet.strip()" } -} \ No newline at end of file +} diff --git a/transforms/code/code_profiler/python/src/ruleset/UAST_rules_vhdl.json b/transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_vhdl.json similarity index 99% rename from transforms/code/code_profiler/python/src/ruleset/UAST_rules_vhdl.json rename to transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_vhdl.json index 861f310dd..7e477bb07 100644 --- a/transforms/code/code_profiler/python/src/ruleset/UAST_rules_vhdl.json +++ b/transforms/code/code_profiler/dpk_code_profiler/ruleset/UAST_rules_vhdl.json @@ -11,4 +11,4 @@ "uast_node_type": "uast_function", "extractor": "temp_0 = code_snippet.split('(')[0].strip() \nself.extracted = temp_0.split(' ')[-1].strip()" } -} \ No newline at end of file +} diff --git a/transforms/code/code_profiler/python/src/semantic-ruleset/ikb_model.csv b/transforms/code/code_profiler/dpk_code_profiler/semantic-ruleset/ikb_model.csv similarity index 100% rename from transforms/code/code_profiler/python/src/semantic-ruleset/ikb_model.csv rename to transforms/code/code_profiler/dpk_code_profiler/semantic-ruleset/ikb_model.csv diff --git a/transforms/code/code_profiler/python/src/semantic-ruleset/null_libs.csv b/transforms/code/code_profiler/dpk_code_profiler/semantic-ruleset/null_libs.csv similarity index 100% rename from transforms/code/code_profiler/python/src/semantic-ruleset/null_libs.csv rename to transforms/code/code_profiler/dpk_code_profiler/semantic-ruleset/null_libs.csv diff --git a/transforms/code/code_profiler/ray/src/semantic-ruleset/offline-ikb-builder/concept_list.csv b/transforms/code/code_profiler/dpk_code_profiler/semantic-ruleset/offline-ikb-builder/concept_list.csv similarity index 100% rename from transforms/code/code_profiler/ray/src/semantic-ruleset/offline-ikb-builder/concept_list.csv rename to 
transforms/code/code_profiler/dpk_code_profiler/semantic-ruleset/offline-ikb-builder/concept_list.csv diff --git a/transforms/code/code_profiler/ray/src/semantic-ruleset/offline-ikb-builder/examples/examples-i.csv b/transforms/code/code_profiler/dpk_code_profiler/semantic-ruleset/offline-ikb-builder/examples/examples-i.csv similarity index 100% rename from transforms/code/code_profiler/ray/src/semantic-ruleset/offline-ikb-builder/examples/examples-i.csv rename to transforms/code/code_profiler/dpk_code_profiler/semantic-ruleset/offline-ikb-builder/examples/examples-i.csv diff --git a/transforms/code/code_profiler/ray/src/semantic-ruleset/offline-ikb-builder/examples/examples-o.csv b/transforms/code/code_profiler/dpk_code_profiler/semantic-ruleset/offline-ikb-builder/examples/examples-o.csv similarity index 100% rename from transforms/code/code_profiler/ray/src/semantic-ruleset/offline-ikb-builder/examples/examples-o.csv rename to transforms/code/code_profiler/dpk_code_profiler/semantic-ruleset/offline-ikb-builder/examples/examples-o.csv diff --git a/transforms/code/code_profiler/python/src/semantic-ruleset/offline-ikb-builder/generate_ikb.py b/transforms/code/code_profiler/dpk_code_profiler/semantic-ruleset/offline-ikb-builder/generate_ikb.py similarity index 57% rename from transforms/code/code_profiler/python/src/semantic-ruleset/offline-ikb-builder/generate_ikb.py rename to transforms/code/code_profiler/dpk_code_profiler/semantic-ruleset/offline-ikb-builder/generate_ikb.py index a41a4e9a9..2c4e8fed9 100644 --- a/transforms/code/code_profiler/python/src/semantic-ruleset/offline-ikb-builder/generate_ikb.py +++ b/transforms/code/code_profiler/dpk_code_profiler/semantic-ruleset/offline-ikb-builder/generate_ikb.py @@ -10,12 +10,13 @@ # limitations under the License. 
################################################################################ -import os import argparse import csv +import os +from io import BytesIO, StringIO + import pyarrow as pa import pyarrow.csv as pv -from io import StringIO,BytesIO from watsonxai import generateResponseWatsonx @@ -23,25 +24,24 @@ def getStringFromCSV(file): table = pv.read_csv(file) csv_buffer = StringIO() column_names = table.column_names - csv_buffer.write(','.join(column_names) + '\n') + csv_buffer.write(",".join(column_names) + "\n") for row in range(table.num_rows): row_data = [str(table[column][row].as_py()) for column in column_names] - csv_buffer.write(','.join(row_data) + '\n') + csv_buffer.write(",".join(row_data) + "\n") return csv_buffer.getvalue() - def gen_combined_strings(file_data): - file_data = file_data.splitlines() + file_data = file_data.splitlines() headers = file_data[0] null_libraries = file_data[1:] combined_strings = [] combined_string = "" for idx, entry in enumerate(null_libraries, start=1): - if combined_string == "": + if combined_string == "": combined_string += f"{headers.strip()}\n" combined_string += f"{entry}\n" - if idx % 30 == 0 or idx == len(null_libraries): + if idx % 30 == 0 or idx == len(null_libraries): combined_strings.append(combined_string) combined_string = "" return combined_strings @@ -49,19 +49,57 @@ def gen_combined_strings(file_data): if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Generate IKB.') - parser.add_argument('--null_libs_file', type=str, help='Path to null libraries file.', default=os.getenv('NULL_LIBS_FILE', '../ikb/null_libs.csv')) - parser.add_argument('--cmap_file', type=str, help='Path to concept map file.', default=os.getenv('CMAP_FILE', '../concept_map/updated_concept_list.csv')) - parser.add_argument('--input_examples_file', type=str, help='Path to input examples file.', default=os.getenv('EXAMPLES_I_FILE', '../examples/examples-i.csv')) - parser.add_argument('--output_examples_file', type=str, help='Path to output examples file.', default=os.getenv('EXAMPLES_O_FILE', '../examples/examples-o.csv')) - parser.add_argument('--extracted_data_file', type=str, help='Path to file in which LLM output will be stored.', default=os.getenv('EXTRACTED_DATA_FILE', '../ikb/extracted_data.csv')) - parser.add_argument('--api_type', type=str, help='API Type', default=os.getenv('API_TYPE', 'WatsonxAI')) - parser.add_argument('--api_key', type=str, help='API key', default=os.getenv('API_KEY', '')) - parser.add_argument('--api_endpoint', type=str, help='API endpoint', default=os.getenv('API_ENDPOINT', 'https://us-south.ml.cloud.ibm.com')) - parser.add_argument('--project_id', type=str, help='Project ID', default=os.getenv('PROJECT_ID', '')) - parser.add_argument('--model_id', type=str, help='LLM model ID', default=os.getenv('MODEL_ID', 'meta-llama/llama-3-70b-instruct')) - - + parser = argparse.ArgumentParser(description="Generate IKB.") + parser.add_argument( + "--null_libs_file", + type=str, + help="Path to null libraries file.", + default=os.getenv("NULL_LIBS_FILE", "../ikb/null_libs.csv"), + ) + parser.add_argument( + "--cmap_file", + type=str, + help="Path to concept map file.", + default=os.getenv("CMAP_FILE", "../concept_map/updated_concept_list.csv"), + ) + parser.add_argument( + "--input_examples_file", + type=str, + help="Path to input examples file.", + default=os.getenv("EXAMPLES_I_FILE", "../examples/examples-i.csv"), + ) + parser.add_argument( + "--output_examples_file", + type=str, + help="Path to output examples file.", 
+ default=os.getenv("EXAMPLES_O_FILE", "../examples/examples-o.csv"), + ) + parser.add_argument( + "--extracted_data_file", + type=str, + help="Path to file in which LLM output will be stored.", + default=os.getenv("EXTRACTED_DATA_FILE", "../ikb/extracted_data.csv"), + ) + parser.add_argument( + "--api_type", + type=str, + help="API Type", + default=os.getenv("API_TYPE", "WatsonxAI"), + ) + parser.add_argument("--api_key", type=str, help="API key", default=os.getenv("API_KEY", "")) + parser.add_argument( + "--api_endpoint", + type=str, + help="API endpoint", + default=os.getenv("API_ENDPOINT", "https://us-south.ml.cloud.ibm.com"), + ) + parser.add_argument("--project_id", type=str, help="Project ID", default=os.getenv("PROJECT_ID", "")) + parser.add_argument( + "--model_id", + type=str, + help="LLM model ID", + default=os.getenv("MODEL_ID", "meta-llama/llama-3-70b-instruct"), + ) args = parser.parse_args() concepts = getStringFromCSV(args.cmap_file) @@ -69,12 +107,15 @@ def gen_combined_strings(file_data): output_examples = getStringFromCSV(args.output_examples_file) null_libs_file_data = getStringFromCSV(args.null_libs_file) - combined_strings = gen_combined_strings(null_libs_file_data) + combined_strings = gen_combined_strings(null_libs_file_data) endtoken = "" prompt_name = "My-prompt" - prompt_template = '''You are responsible for classifying programming language packages based on their functionality into one of the following STRICT categories: - ''' + concepts + ''' + prompt_template = ( + """You are responsible for classifying programming language packages based on their functionality into one of the following STRICT categories: + """ + + concepts + + """ Instructions: @@ -100,36 +141,38 @@ def gen_combined_strings(file_data): Examples: INPUT: - ''' + str(input_examples) + "OUTPUT:\n" + str(output_examples).strip("\n")+"\n" + """ + + str(input_examples) + + "OUTPUT:\n" + + str(output_examples).strip("\n") + + "\n" + ) headers = ["Library", "Language", "Category"] file_exists = os.path.exists(args.extracted_data_file) if not file_exists: - with open(args.extracted_data_file, mode='w', newline='') as f: - csv_writer = csv.writer(f, quoting=csv.QUOTE_NONE, escapechar='\\') - csv_writer.writerow(headers) - + with open(args.extracted_data_file, mode="w", newline="") as f: + csv_writer = csv.writer(f, quoting=csv.QUOTE_NONE, escapechar="\\") + csv_writer.writerow(headers) for combined_string in combined_strings: input_template = prompt_template + f"\n\nINPUT: {combined_string} \nOUTPUT: " - if args.api_type == 'WatsonxAI': - response = generateResponseWatsonx(args.api_key, args.api_endpoint, args.model_id, args.project_id, input_template) - data = response.split(endtoken)[0] - csv_file = BytesIO(data.strip().encode('utf-8')) + if args.api_type == "WatsonxAI": + response = generateResponseWatsonx( + args.api_key, + args.api_endpoint, + args.model_id, + args.project_id, + input_template, + ) + data = response.split(endtoken)[0] + csv_file = BytesIO(data.strip().encode("utf-8")) csv_content = data.splitlines() not_first_row = 0 - with open(args.extracted_data_file, mode='a', newline='') as f: - csv_writer = csv.writer(f, quoting=csv.QUOTE_NONE, escapechar='\\') + with open(args.extracted_data_file, mode="a", newline="") as f: + csv_writer = csv.writer(f, quoting=csv.QUOTE_NONE, escapechar="\\") for line in csv_content: if not_first_row: - row = line.split(',') + row = line.split(",") csv_writer.writerow(row) not_first_row = 1 - - - - - - - - diff --git 
diff --git a/transforms/code/code_profiler/python/src/semantic-ruleset/offline-ikb-builder/watsonxai.py b/transforms/code/code_profiler/dpk_code_profiler/semantic-ruleset/offline-ikb-builder/watsonxai.py
similarity index 89%
rename from transforms/code/code_profiler/python/src/semantic-ruleset/offline-ikb-builder/watsonxai.py
rename to transforms/code/code_profiler/dpk_code_profiler/semantic-ruleset/offline-ikb-builder/watsonxai.py
index a51efeb16..23086f1d3 100644
--- a/transforms/code/code_profiler/python/src/semantic-ruleset/offline-ikb-builder/watsonxai.py
+++ b/transforms/code/code_profiler/dpk_code_profiler/semantic-ruleset/offline-ikb-builder/watsonxai.py
@@ -10,11 +10,9 @@
 # limitations under the License.
 ################################################################################
 
-from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
-from ibm_watsonx_ai.foundation_models import ModelInference
 from ibm_watsonx_ai import Credentials
-
-
+from ibm_watsonx_ai.foundation_models import ModelInference
+from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
 
 
 def generateResponseWatsonx(api_key, api_endpoint, model_id, project_id, input_template):
@@ -22,16 +20,13 @@ def generateResponseWatsonx(api_key, api_endpoint, model_id, project_id, input_t
     parameters = {
         GenParams.DECODING_METHOD: "greedy",
         GenParams.MAX_NEW_TOKENS: 100,
-        GenParams.STOP_SEQUENCES: [""]
+        GenParams.STOP_SEQUENCES: [""],
     }
     model = ModelInference(
-        model_id=model_id, 
-        params=parameters, 
+        model_id=model_id,
+        params=parameters,
         credentials=credentials,
-        project_id=project_id)
+        project_id=project_id,
+    )
     response = model.generate_text(input_template)
     return response
-
-
-
-
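The renamed watsonxai.py module exposes a single helper around ibm_watsonx_ai's ModelInference. A hypothetical smoke test, assuming valid watsonx.ai credentials are provided through the same environment variables the IKB builder script reads (API_KEY, API_ENDPOINT, MODEL_ID, PROJECT_ID); this is a sketch, not part of the patch:

import os

from watsonxai import generateResponseWatsonx

# Hypothetical invocation; the keyword names match the helper's parameters
# above, and the fallback values mirror the builder script's argparse defaults.
response = generateResponseWatsonx(
    api_key=os.environ["API_KEY"],
    api_endpoint=os.getenv("API_ENDPOINT", "https://us-south.ml.cloud.ibm.com"),
    model_id=os.getenv("MODEL_ID", "meta-llama/llama-3-70b-instruct"),
    project_id=os.environ["PROJECT_ID"],
    input_template="INPUT: pandas,Python \nOUTPUT: ",
)
print(response)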
diff --git a/transforms/code/code_profiler/python/src/semantic_concepts.py b/transforms/code/code_profiler/dpk_code_profiler/semantic_concepts.py
similarity index 79%
rename from transforms/code/code_profiler/python/src/semantic_concepts.py
rename to transforms/code/code_profiler/dpk_code_profiler/semantic_concepts.py
index a7f004edd..970f089e7 100644
--- a/transforms/code/code_profiler/python/src/semantic_concepts.py
+++ b/transforms/code/code_profiler/dpk_code_profiler/semantic_concepts.py
@@ -10,24 +10,27 @@
 # limitations under the License.
 ################################################################################
 
-import pyarrow.csv as pacsv
 import csv
 
+import pyarrow.csv as pacsv
+
 
 class TrieNode:
-    '''
+    """
     Implements one node of a Trie datastructure
-    '''
+    """
+
     def __init__(self):
         self.children = {}
         self.is_end_of_word = False
         self.data = None
 
+
 class Trie:
-    '''
+    """
     Implements a Trie datastructure for efficient retrieval of concepts from the IKB.
-    '''
+    """
+
     def __init__(self):
         self.root = TrieNode()
 
@@ -38,15 +41,15 @@ def insert(self, library_name, programming_language, functionality):
                 node.children[char] = TrieNode()
             node = node.children[char]
         node.data = {}
-        node.data['Category'] = functionality
-        node.data['Language'] = programming_language
+        node.data["Category"] = functionality
+        node.data["Language"] = programming_language
         node.is_end_of_word = True
 
     def search(self, library_name, programming_language):
         node = self.root
         for char in library_name:
             if char not in node.children:
-                return None 
+                return None
             node = node.children[char]
         if node.is_end_of_word and node.data:
             return node.data
@@ -54,41 +57,42 @@
 
 class knowledge_base:
-    '''
+    """
     Implements the internal knowledge base.
-    '''
-    knowledge_base_file = ''
-    null_file = ''
+    """
+
+    knowledge_base_file = ""
+    null_file = ""
     knowledge_base_table = None
     knowledge_base_trie = None
     entries_with_null_coverage = set()
-    
+
     def __init__(self, ikb_file, null_libs_file):
         self.knowledge_base_file = ikb_file
-        self.null_file = null_libs_file     
+        self.null_file = null_libs_file
 
     def load_ikb_trie(self):
         self.knowledge_base_table = pacsv.read_csv(self.knowledge_base_file)
         self.knowledge_base_trie = Trie()
-        library_column = self.knowledge_base_table.column('Library').to_pylist()
-        language_column = self.knowledge_base_table.column('Language').to_pylist()
-        category_column = self.knowledge_base_table.column('Category').to_pylist()
+        library_column = self.knowledge_base_table.column("Library").to_pylist()
+        language_column = self.knowledge_base_table.column("Language").to_pylist()
+        category_column = self.knowledge_base_table.column("Category").to_pylist()
         for library, language, category in zip(library_column, language_column, category_column):
             self.knowledge_base_trie.insert(str.lower(library), language, category)
 
     def write_null_files(self):
-        with open(self.null_file, 'a+', newline='', encoding='utf-8') as csvfile:
+        with open(self.null_file, "a+", newline="", encoding="utf-8") as csvfile:
            writer = csv.writer(csvfile)
             for entry in self.entries_with_null_coverage:
                 writer.writerow([entry[0], entry[1]])
         self.entries_with_null_coverage = set()
 
 
-def concept_extractor(libraries,language,ikb):
-    '''
+def concept_extractor(libraries, language, ikb):
+    """
     Given a set of libraries and the corresponding programming language along with the IKB trie,
     this function returns the matching concept(s) as a comma separated list joined into a string.
-    '''
+    """
     concept_coverage = set()
     language = language
     libraries = [item.strip() for item in libraries.split(",")]
@@ -97,16 +101,17 @@ def concept_extractor(libraries,language,ikb):
         extracted_base_name = str.lower(library)
         matched_entry = ikb.knowledge_base_trie.search(extracted_base_name, language)
         if matched_entry:
-            concept_coverage.add(matched_entry['Category'].strip())
+            concept_coverage.add(matched_entry["Category"].strip())
         else:
-            ikb.entries_with_null_coverage.add((library,language))
-    return ','.join(sorted(list(concept_coverage)))
+            ikb.entries_with_null_coverage.add((library, language))
+    return ",".join(sorted(list(concept_coverage)))
+
 
 def concept_extractor(libraries, language, ikb):
-    '''
+    """
     Given a set of libraries and the corresponding programming language along with the IKB trie,
     this function returns the matching concept(s) as a comma-separated list joined into a string.
-    '''
+    """
     concept_coverage = set()
     language = language
     # Check if libraries is None or empty
@@ -120,8 +125,7 @@ def concept_extractor(libraries, language, ikb):
         extracted_base_name = str.lower(library)
         matched_entry = ikb.knowledge_base_trie.search(extracted_base_name, language)
         if matched_entry:
-            concept_coverage.add(matched_entry['Category'].strip())
+            concept_coverage.add(matched_entry["Category"].strip())
         else:
             ikb.entries_with_null_coverage.add((library, language))
-    return ','.join(sorted(list(concept_coverage)))
-
+    return ",".join(sorted(list(concept_coverage)))
diff --git a/transforms/code/code_profiler/python/src/template.html b/transforms/code/code_profiler/dpk_code_profiler/template.html
similarity index 100%
rename from transforms/code/code_profiler/python/src/template.html
rename to transforms/code/code_profiler/dpk_code_profiler/template.html
diff --git a/transforms/code/code_profiler/python/src/tool_utils/aggregate_report.py b/transforms/code/code_profiler/dpk_code_profiler/tool_utils/aggregate_report.py
similarity index 87%
rename from transforms/code/code_profiler/python/src/tool_utils/aggregate_report.py
rename to transforms/code/code_profiler/dpk_code_profiler/tool_utils/aggregate_report.py
index 067fddca2..8851b7968 100644
--- a/transforms/code/code_profiler/python/src/tool_utils/aggregate_report.py
+++ b/transforms/code/code_profiler/dpk_code_profiler/tool_utils/aggregate_report.py
@@ -9,11 +9,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 ################################################################################
-import json
 import glob
-from collections import defaultdict, Counter
-from pathlib import Path
+import json
+from collections import Counter, defaultdict
 from json.decoder import JSONDecodeError
+from pathlib import Path
 
 
 def aggregate_metrics(input_dir, output_file):
@@ -23,12 +23,12 @@ def aggregate_metrics(input_dir, output_file):
     # Loop through each JSON file in the specified directory
     for json_file in glob.glob(f"{input_dir}/output_*.json"):
         try:
-            with open(json_file, 'r') as f:
+            with open(json_file, "r") as f:
                 data = json.load(f)
                 for metric in data.get("metrics", []):
                     metric_name = metric["name"]
                     value_counts = metric.get("value_counts", {})
-                    
+
                     # Aggregate counts for each unique metric
                     for key, count in value_counts.items():
                         aggregated_metrics[metric_name][key] += count
@@ -39,21 +39,19 @@ def aggregate_metrics(input_dir, output_file):
         "title": "Aggregated Profiler Report",
         "heading": "Aggregated Syntactic and Semantic Profile",
         "description": "This report presents aggregated profiling data across multiple JSON files.",
-        "metrics": []
+        "metrics": [],
     }
 
     # Convert aggregated metrics to the required format
     for metric_name, value_counts in aggregated_metrics.items():
-        aggregated_data["metrics"].append({
-            "name": metric_name,
-            "value_counts": dict(value_counts)
-        })
+        aggregated_data["metrics"].append({"name": metric_name, "value_counts": dict(value_counts)})
 
     # Write the aggregated results to the output JSON file
-    with open(output_file, 'w') as f:
+    with open(output_file, "w") as f:
         json.dump(aggregated_data, f, indent=4)
-    
+
     print(f"Aggregated report saved as: {output_file}")
 
+
 # Usage example
-aggregate_metrics(input_dir="../", output_file="aggregated_output.json")
\ No newline at end of file
+aggregate_metrics(input_dir="../", output_file="aggregated_output.json")
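aggregate_metrics above merges per-file value_counts into one report; the accumulation step reduces to a Counter update per metric name. A self-contained sketch of the same merge on invented data (the input shapes follow the JSON structure the function reads):

from collections import Counter, defaultdict

# Toy version of the per-metric accumulation done in aggregate_metrics above.
docs = [
    {"metrics": [{"name": "CCR", "value_counts": {"1": 4, "2": 1}}]},
    {"metrics": [{"name": "CCR", "value_counts": {"2": 3}}]},
]

aggregated = defaultdict(Counter)
for doc in docs:
    for metric in doc.get("metrics", []):
        # Counter.update adds counts key-wise instead of replacing them.
        aggregated[metric["name"]].update(metric.get("value_counts", {}))

assert aggregated["CCR"] == Counter({"1": 4, "2": 4})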
diff --git a/transforms/code/code_profiler/python/src/tool_utils/aggregated_output_wca_ept_1.json b/transforms/code/code_profiler/dpk_code_profiler/tool_utils/aggregated_output_wca_ept_1.json
similarity index 99%
rename from transforms/code/code_profiler/python/src/tool_utils/aggregated_output_wca_ept_1.json
rename to transforms/code/code_profiler/dpk_code_profiler/tool_utils/aggregated_output_wca_ept_1.json
index 20bd470ae..eee40a5e3 100644
--- a/transforms/code/code_profiler/python/src/tool_utils/aggregated_output_wca_ept_1.json
+++ b/transforms/code/code_profiler/dpk_code_profiler/tool_utils/aggregated_output_wca_ept_1.json
@@ -67754,4 +67754,4 @@
             }
         }
     ]
-}
\ No newline at end of file
+}
diff --git a/transforms/code/code_profiler/python/src/tool_utils/report_stats_generation.py b/transforms/code/code_profiler/dpk_code_profiler/tool_utils/report_stats_generation.py
similarity index 90%
rename from transforms/code/code_profiler/python/src/tool_utils/report_stats_generation.py
rename to transforms/code/code_profiler/dpk_code_profiler/tool_utils/report_stats_generation.py
index c7507a6fa..5566abad9 100644
--- a/transforms/code/code_profiler/python/src/tool_utils/report_stats_generation.py
+++ b/transforms/code/code_profiler/dpk_code_profiler/tool_utils/report_stats_generation.py
@@ -10,15 +10,17 @@
 # limitations under the License.
 ################################################################################
 
-import numpy as np
-import matplotlib.pyplot as plt
 import json
 import os
 
+import matplotlib.pyplot as plt
+import numpy as np
+
+
 def process_and_save(value_counts, metric_name, output_data):
     # Separate numeric keys, skipping "0"
     numeric_keys = {}
-    
+
     for k, v in value_counts.items():
         if k == "0":  # Skip the key "0"
             continue
@@ -30,7 +32,7 @@ def process_and_save(value_counts, metric_name, output_data):
     # Sort numeric keys
     keys = np.array(list(numeric_keys.keys()))
     counts = np.array([numeric_keys[key] for key in keys])
-    
+
     sorted_indices = np.argsort(keys)
     keys = keys[sorted_indices]
     counts = counts[sorted_indices]
@@ -51,33 +53,34 @@ def process_and_save(value_counts, metric_name, output_data):
 
     # Distribution (Bar Plot)
     plt.subplot(3, 1, 1)
-    plt.bar(keys, counts, width=0.4, color='skyblue', alpha=0.8)
+    plt.bar(keys, counts, width=0.4, color="skyblue", alpha=0.8)
     plt.title(f"Distribution of {metric_name}")
     plt.xlabel("Values")
     plt.ylabel("Counts")
-    plt.grid(axis='y', linestyle='--', alpha=0.7)
+    plt.grid(axis="y", linestyle="--", alpha=0.7)
 
     # PDF (Bar Plot)
     plt.subplot(3, 1, 2)
-    plt.bar(keys, pdf, width=0.4, color='blue', alpha=0.7)
+    plt.bar(keys, pdf, width=0.4, color="blue", alpha=0.7)
     plt.title(f"PDF (Probability Density Function) of {metric_name}")
     plt.xlabel("Values")
     plt.ylabel("Probability")
-    plt.grid(axis='y', linestyle='--', alpha=0.7)
+    plt.grid(axis="y", linestyle="--", alpha=0.7)
 
     # CDF (Line Plot)
     plt.subplot(3, 1, 3)
-    plt.plot(keys, cdf, marker='o', color='orange', linewidth=2)
+    plt.plot(keys, cdf, marker="o", color="orange", linewidth=2)
     plt.title(f"CDF (Cumulative Distribution Function) of {metric_name}")
     plt.xlabel("Values")
     plt.ylabel("Cumulative Probability")
-    plt.grid(axis='y', linestyle='--', alpha=0.7)
+    plt.grid(axis="y", linestyle="--", alpha=0.7)
 
     # Save plot as PNG
     plt.tight_layout()
     plt.savefig(f"{metric_name}_distribution.png")
     plt.close()
 
+
 # Read JSON input
 input_file = "aggregated_output.json"  # Adjust path as needed
 output_file = "report_stats_generation.json"  # Output JSON file
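report_stats_generation.py derives a PDF and a CDF from the aggregated counts before plotting; the numerical core is just normalisation followed by a cumulative sum. A minimal sketch with invented values:

import numpy as np

# Toy version of the PDF/CDF computation plotted above: given sorted metric
# values and their counts, the PDF is counts / total and the CDF is the
# running sum of the PDF.
keys = np.array([1.0, 2.0, 4.0])
counts = np.array([5, 3, 2])

pdf = counts / counts.sum()
cdf = np.cumsum(pdf)

assert np.isclose(cdf[-1], 1.0)  # a CDF ends at 1 by construction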
diff --git a/transforms/code/code_profiler/python/src/code_profiler_transform.py b/transforms/code/code_profiler/dpk_code_profiler/transform.py
similarity index 79%
rename from transforms/code/code_profiler/python/src/code_profiler_transform.py
rename to transforms/code/code_profiler/dpk_code_profiler/transform.py
index d42847a1d..1fffcdc7a 100644
--- a/transforms/code/code_profiler/python/src/code_profiler_transform.py
+++ b/transforms/code/code_profiler/dpk_code_profiler/transform.py
@@ -10,32 +10,29 @@
 # limitations under the License.
 ################################################################################
 
+import atexit
+import json
 import os
+import shutil
 import subprocess
+import uuid
 from argparse import ArgumentParser, Namespace
 from typing import Any
 
-from data_processing.utils import get_logger
-import uuid
-import shutil
-import atexit
 import pyarrow as pa
-from data_processing.transform import AbstractTableTransform
-from tree_sitter import Language, Parser as TSParser
-from tree_sitter_languages import get_language
-
-
-from UAST_parser import UASTParser, uast_read
-import json
-from data_processing.transform import AbstractBinaryTransform, TransformConfiguration
-
-from data_processing.utils import (
-    CLIArgumentProvider,
-    get_logger,
+from data_processing.transform import (
+    AbstractBinaryTransform,
+    AbstractTableTransform,
+    TransformConfiguration,
 )
-from semantic_concepts import *
+from data_processing.utils import CLIArgumentProvider, get_logger
 from higher_order_concepts import *
 from profiler_report import *
+from semantic_concepts import *
+from tree_sitter import Language
+from tree_sitter import Parser as TSParser
+from tree_sitter_languages import get_language
+from UAST_parser import UASTParser, uast_read
 
 short_name = "CodeProfiler"
@@ -43,6 +40,7 @@
 language = "language"
 contents = "contents"
 
+
 class CodeProfilerTransform(AbstractTableTransform):
     """
     Implements a simple copy of a pyarrow Table.
@@ -54,12 +52,12 @@ def __init__(self, config: dict[str, Any]):
         """
         super().__init__(config)
-        
+
         self.contents = self.config.get("contents", "contents")
-        self.language = self.config.get("language", "language")  
+        self.language = self.config.get("language", "language")
 
         if not isinstance(self.contents, str):
-            raise ValueError(f"'contents' should be a string, got {type(self.contents).__name__}")  
+            raise ValueError(f"'contents' should be a string, got {type(self.contents).__name__}")
 
         def ensure_tree_sitter_bindings():
             # Get the directory where the script is located
@@ -69,17 +67,24 @@ def ensure_tree_sitter_bindings():
             # Clone the bindings only if the unique directory does not exist
             if not os.path.exists(bindings_dir):
                 print(f"Cloning tree-sitter bindings into {bindings_dir}...")
-                result = subprocess.run(["git", "clone", "https://github.com/pankajskku/tree-sitter-bindings.git", bindings_dir])
+                result = subprocess.run(
+                    [
+                        "git",
+                        "clone",
+                        "https://github.com/pankajskku/tree-sitter-bindings.git",
+                        bindings_dir,
+                    ]
+                )
                 if result.returncode != 0:
                     raise RuntimeError(f"Failed to clone tree-sitter bindings into {bindings_dir}")
-            return bindings_dir 
+            return bindings_dir
 
         # Call this function before the main code execution
         self.bindings_dir = ensure_tree_sitter_bindings()
 
         # Use the correct architecture for runtime
-        RUNTIME_HOST_ARCH = os.environ.get('RUNTIME_HOST_ARCH', 'x86_64')
-        bindings_path = self.bindings_dir + '/' + RUNTIME_HOST_ARCH # for MAC: mach-arm64
+        RUNTIME_HOST_ARCH = os.environ.get("RUNTIME_HOST_ARCH", "x86_64")
+        bindings_path = self.bindings_dir + "/" + RUNTIME_HOST_ARCH  # for MAC: mach-arm64
 
         print(f"Bindings bindings_dir: {self.bindings_dir}")
         print(f"Bindings path: {bindings_path}")
@@ -88,26 +93,26 @@ def ensure_tree_sitter_bindings():
             raise FileNotFoundError(f"Bindings path does not exist: {bindings_path}")
 
         try:
-            AGDA_LANGUAGE = Language(os.path.join(bindings_path, 'agda-bindings.so'), 'agda')
-            C_LANGUAGE = get_language('c')
+            AGDA_LANGUAGE = Language(os.path.join(bindings_path, "agda-bindings.so"), "agda")
+            C_LANGUAGE = get_language("c")
             CPP_LANGUAGE = get_language("cpp")
-            CSHARP_LANGUAGE = Language(os.path.join(bindings_path, 'c_sharp-bindings.so'), 'c_sharp')
-            D_LANGUAGE = Language(os.path.join(bindings_path, 'd-bindings.so'), 'd')
-            DART_LANGUAGE = Language(os.path.join(bindings_path, 'dart-bindings.so'), 'dart')
-            ELM_LANGUAGE = Language(os.path.join(bindings_path, 'elm-bindings.so'), 'elm')
-            GOLANG_LANGUAGE = Language(os.path.join(bindings_path, 'go-bindings.so'), 'go')
-            HASKELL_LANGUAGE = Language(os.path.join(bindings_path, 'haskell-bindings.so'), 'haskell')
+            CSHARP_LANGUAGE = Language(os.path.join(bindings_path, "c_sharp-bindings.so"), "c_sharp")
+            D_LANGUAGE = Language(os.path.join(bindings_path, "d-bindings.so"), "d")
+            DART_LANGUAGE = Language(os.path.join(bindings_path, "dart-bindings.so"), "dart")
+            ELM_LANGUAGE = Language(os.path.join(bindings_path, "elm-bindings.so"), "elm")
+            GOLANG_LANGUAGE = Language(os.path.join(bindings_path, "go-bindings.so"), "go")
+            HASKELL_LANGUAGE = Language(os.path.join(bindings_path, "haskell-bindings.so"), "haskell")
             JAVA_LANGUAGE = get_language("java")
-            JAVASCRIPT_LANGUAGE = Language(os.path.join(bindings_path, 'js-bindings.so'), 'javascript')
-            KOTLIN_LANGUAGE = Language(os.path.join(bindings_path, 'kotlin-bindings.so'), 'kotlin')
-            NIM_LANGUAGE = Language(os.path.join(bindings_path, 'nim-bindings.so'), 'nim')
-            #OBJECTIVE_C_LANGUAGE = Language(os.path.join(bindings_path, 'objc-bindings.so'), 'objc')
+            JAVASCRIPT_LANGUAGE = Language(os.path.join(bindings_path, "js-bindings.so"), "javascript")
+            KOTLIN_LANGUAGE = Language(os.path.join(bindings_path, "kotlin-bindings.so"), "kotlin")
+            NIM_LANGUAGE = Language(os.path.join(bindings_path, "nim-bindings.so"), "nim")
+            # OBJECTIVE_C_LANGUAGE = Language(os.path.join(bindings_path, 'objc-bindings.so'), 'objc')
             OCAML_LANGUAGE = get_language("ocaml")
             PERL_LANGUAGE = get_language("perl")
             PY_LANGUAGE = get_language("python")
-            QMLJS_LANGUAGE = Language(os.path.join(bindings_path, 'qmljs-bindings.so'), 'qmljs')
+            QMLJS_LANGUAGE = Language(os.path.join(bindings_path, "qmljs-bindings.so"), "qmljs")
             RUST_LANGUAGE = get_language("rust")
-            SCALA_LANGUAGE = Language(os.path.join(bindings_path, 'scala-bindings.so'), 'scala')
+            SCALA_LANGUAGE = Language(os.path.join(bindings_path, "scala-bindings.so"), "scala")
             TYPESCRIPT_LANGUAGE = get_language("typescript")
         except Exception as e:
             self.clean_bindings()
@@ -123,7 +128,7 @@ def ensure_tree_sitter_bindings():
             "Cpp": CPP_LANGUAGE,
             "D": D_LANGUAGE,
             "Dart": DART_LANGUAGE,
-            "Elm" : ELM_LANGUAGE,
+            "Elm": ELM_LANGUAGE,
             "Go": GOLANG_LANGUAGE,
             "Haskell": HASKELL_LANGUAGE,
             "Java": JAVA_LANGUAGE,
@@ -131,37 +136,37 @@ def ensure_tree_sitter_bindings():
             "Kotlin": KOTLIN_LANGUAGE,
             "Nim": NIM_LANGUAGE,
             "Ocaml": OCAML_LANGUAGE,
-            #"Objective-C": OBJECTIVE_C_LANGUAGE,
+            # "Objective-C": OBJECTIVE_C_LANGUAGE,
             "Perl": PERL_LANGUAGE,
             "Python": PY_LANGUAGE,
             "Qmljs": QMLJS_LANGUAGE,
             "Rust": RUST_LANGUAGE,
             "Scala": SCALA_LANGUAGE,
-            "TypeScript": TYPESCRIPT_LANGUAGE
+            "TypeScript": TYPESCRIPT_LANGUAGE,
         }
         self.uast_language_map = {
-            "Agda": 'agda',
-            "C": 'c',
-            "C#": 'c_sharp',
-            "C++": 'cpp',
-            "Cpp": 'cpp',
-            "D": 'd',
-            "Dart": 'dart',
-            "Elm" : 'elm',
-            "Go": 'go',
-            "Haskell": 'haskell',
-            "Java": 'java',
-            "JavaScript": 'js',
-            "Kotlin": 'kotlin',
-            "Nim": 'nim',
-            "Ocaml": 'ocaml',
-            #"Objective-C": 'objc',
-            "Perl": 'perl',
-            "Python": 'py',
-            "Qmljs": 'qmljs',
-            "Rust": 'rust',
-            "Scala": 'scala',
-            "TypeScript": 'typescript'
+            "Agda": "agda",
+            "C": "c",
+            "C#": "c_sharp",
+            "C++": "cpp",
+            "Cpp": "cpp",
+            "D": "d",
+            "Dart": "dart",
+            "Elm": "elm",
+            "Go": "go",
+            "Haskell": "haskell",
+            "Java": "java",
+            "JavaScript": "js",
+            "Kotlin": "kotlin",
+            "Nim": "nim",
+            "Ocaml": "ocaml",
+            # "Objective-C": 'objc',
+            "Perl": "perl",
+            "Python": "py",
+            "Qmljs": "qmljs",
+            "Rust": "rust",
+            "Scala": "scala",
+            "TypeScript": "typescript",
         }
         self.logger = get_logger(__name__)
         self.ruleset_file = os.path.dirname(os.path.abspath(__file__))
@@ -178,7 +183,7 @@ def ensure_tree_sitter_bindings():
         # Raise an error if the file still doesn't exist
         if not os.path.exists(self.ikb_file):
             raise FileNotFoundError(f"File not found: {self.ikb_file}")
-        
+
         # Check if the file exists; if not, update the default path
         if not os.path.exists(self.null_libs_file):
             print(f"File not found at {self.null_libs_file}. Updating to '../semantic-ruleset/null_libs.csv'")
@@ -203,7 +208,7 @@ def get_uast_json(code, lang):
             # Create case-insensitive mappings
             language_map_lower = {key.lower(): value for key, value in self.language_map.items()}
             uast_language_map_lower = {key.lower(): value for key, value in self.uast_language_map.items()}
-            
+
             # Check for the lowercase version of `lang`
             lang_lower = lang.lower()
             if lang_lower in language_map_lower:
@@ -217,31 +222,31 @@ def get_uast_json(code, lang):
         def extract_packages_from_uast(uast_json):
             """Extract package names from the UAST JSON where node_type is 'uast_package'."""
             package_list = []
-            
+
             try:
                 uast_data = json.loads(uast_json)
                 if uast_data is not None:
                     nodes = uast_data.get("nodes", {})
                 else:
                     nodes = {}
-                    print("Warning: uast_data is None. Check the data source or initialization process.") 
-                    return 
+                    print("Warning: uast_data is None. Check the data source or initialization process.")
+                    return
 
                 # Iterate through nodes to find nodes with type 'uast_package'
                 for node_id, node_data in nodes.items():
                     if node_data.get("node_type") == "uast_package":
                         # Extract the package name from the 'code_snippet' (after 'uast_package ')
                         package_name = node_data["code_snippet"].split(" ")[1]
                         package_list.append(package_name)
-            
+
             except json.JSONDecodeError as e:
                 print(f"Failed to parse UAST JSON: {e}")
-            
+
             return ",".join(package_list)  # Return as a comma-separated string
 
         def get_uast_parquet(tmp_table):
             # df = pd.read_parquet(f'{db_path}/{filename}', 'pyarrow')
             # df = df.reindex(columns=all_columns)
-            
+
             # Extract language and content arrays from the table using PyArrow
             print(self.language)
             lang_array = tmp_table.column(self.language)
@@ -250,17 +255,20 @@ def get_uast_parquet(tmp_table):
             assert len(lang_array) == len(content_array)
 
             # Generate UASTs using a list comprehension
-            uasts = [json.dumps(get_uast_json(content_array[i].as_py(), lang_array[i].as_py())) for i in range(len(content_array))]
+            uasts = [
+                json.dumps(get_uast_json(content_array[i].as_py(), lang_array[i].as_py()))
+                for i in range(len(content_array))
+            ]
 
             # Extract package lists from the UAST column
             package_lists = [extract_packages_from_uast(uast) for uast in uasts]
-            
+
             # Add the UAST array as a new column in the PyArrow table
             uast_column = pa.array(uasts)
             package_list_column = pa.array(package_lists)
-            tmp_table_with_uast = tmp_table.append_column('UAST', uast_column)
+            tmp_table_with_uast = tmp_table.append_column("UAST", uast_column)
             # Add the uast_package column
-            table_with_package_list = tmp_table_with_uast.append_column('UAST_Package_List', package_list_column)
+            table_with_package_list = tmp_table_with_uast.append_column("UAST_Package_List", package_list_column)
             return table_with_package_list
 
         table_with_uast = get_uast_parquet(table)
@@ -275,23 +283,23 @@ def get_uast_parquet(tmp_table):
         ikb.load_ikb_trie()
 
         # Extract concept from IKB
-        libraries = table_with_uast.column('UAST_Package_List').to_pylist()
-        language = table_with_uast.column('language').to_pylist()
+        libraries = table_with_uast.column("UAST_Package_List").to_pylist()
+        language = table_with_uast.column("language").to_pylist()
         concepts = [concept_extractor(lib, lang, ikb) for lib, lang in zip(libraries, language)]
-        
+
         # Append concepts column to table and record unknown libraries
         new_col = pa.array(concepts)
-        table_with_uast = table_with_uast.append_column('Concepts', new_col)
+        table_with_uast = table_with_uast.append_column("Concepts", new_col)
         ikb.write_null_files()
 
         # Higher order syntactic profiler
         self.logger.debug(f"Transforming one table with {len(table_with_uast)} rows")
 
         if self.metrics_list is not None:
-            uasts = [uast_read(uast_json) for uast_json in table_with_uast['UAST'].to_pylist()]
+            uasts = [uast_read(uast_json) for uast_json in table_with_uast["UAST"].to_pylist()]
             ccrs = []
-            code_snippet_len = [] 
-            avg_fn_len_in_snippet = [] 
+            code_snippet_len = []
+            avg_fn_len_in_snippet = []
 
             for uast in uasts:
                 if "CCR" in self.metrics_list:
@@ -299,25 +307,30 @@
                 if "code_snippet_len" in self.metrics_list:
                     code_snippet_len.append(extract_code_snippet_length(uast))
                 if "avg_fn_len_in_snippet" in self.metrics_list:
-                    avg_fn_len_in_snippet.append(extract_code_avg_fn_len_in_snippet(uast)) 
+                    avg_fn_len_in_snippet.append(extract_code_avg_fn_len_in_snippet(uast))
 
             if "CCR" in self.metrics_list:
                 table_with_uast = table_with_uast.append_column("CCR", pa.array(ccrs))
             if "code_snippet_len" in self.metrics_list:
                 table_with_uast = table_with_uast.append_column("code_snippet_len", pa.array(code_snippet_len))
             if "avg_fn_len_in_snippet" in self.metrics_list:
-                table_with_uast = table_with_uast.append_column("avg_fn_len_in_snippet", pa.array(avg_fn_len_in_snippet))
+                table_with_uast = table_with_uast.append_column(
+                    "avg_fn_len_in_snippet", pa.array(avg_fn_len_in_snippet)
+                )
 
         self.logger.debug(f"Transformed one table with {len(table_with_uast)} rows")
         metadata = {"nfiles": 1, "nrows": len(table_with_uast)}
 
         # Report generation
-        if 'UAST' in table_with_uast.schema.names and 'Concepts' in table_with_uast.schema.names:
-            generate_report(table_with_uast,self.metrics_list)
+        if "UAST" in table_with_uast.schema.names and "Concepts" in table_with_uast.schema.names:
+            generate_report(table_with_uast, self.metrics_list)
 
         # Add some sample metadata.
         self.logger.debug(f"Transformed one table with {len(table_with_uast)} rows")
-        # report statistics 
-        stats = {"source_documents": table.num_columns, "result_documents": table_with_uast.num_columns}
+        # report statistics
+        stats = {
+            "source_documents": table.num_columns,
+            "result_documents": table_with_uast.num_columns,
+        }
         return [table_with_uast], stats
 
     def clean_bindings(self):
@@ -327,13 +340,15 @@ def clean_bindings(self):
             print(f"Successfully deleted: {self.bindings_dir}")
         except subprocess.CalledProcessError as e:
             print(f"Error deleting {self.bindings_dir}: {e}")
-        
+
+
 class CodeProfilerTransformConfiguration(TransformConfiguration):
     def __init__(self, transform_class: type[AbstractBinaryTransform] = CodeProfilerTransform):
         super().__init__(
             name=short_name,
             transform_class=transform_class,
-        )        
+        )
+
     def add_input_params(self, parser: ArgumentParser) -> None:
         parser.add_argument(
             f"--{language}",
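The transform above repeatedly materialises derived values (UASTs, package lists, concepts, metrics) as Python lists and attaches them to the PyArrow table via append_column. The same pattern in isolation, on a toy table (column values invented):

import pyarrow as pa

# Toy version of the column-append pattern used in transform() above: build a
# list of derived values, wrap it in pa.array, append it as a new column.
table = pa.table({"language": ["Python", "C"], "contents": ["import os", "int x;"]})
derived = [len(s) for s in table.column("contents").to_pylist()]
table = table.append_column("code_snippet_len", pa.array(derived))

assert table.column_names[-1] == "code_snippet_len"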
diff --git a/transforms/code/code_profiler/python/src/code_profiler_transform_python.py b/transforms/code/code_profiler/dpk_code_profiler/transform_python.py
similarity index 97%
rename from transforms/code/code_profiler/python/src/code_profiler_transform_python.py
rename to transforms/code/code_profiler/dpk_code_profiler/transform_python.py
index 8ebd297d2..197f6d93f 100644
--- a/transforms/code/code_profiler/python/src/code_profiler_transform_python.py
+++ b/transforms/code/code_profiler/dpk_code_profiler/transform_python.py
@@ -12,15 +12,16 @@
 
 import time
 
-from code_profiler_transform import (
-    CodeProfilerTransform,
-    CodeProfilerTransformConfiguration,
-)
 from data_processing.runtime.pure_python import PythonTransformLauncher
 from data_processing.runtime.pure_python.runtime_configuration import (
     PythonTransformRuntimeConfiguration,
 )
 from data_processing.utils import get_logger
+from dpk_code_profiler.transform import (
+    CodeProfilerTransform,
+    CodeProfilerTransformConfiguration,
+)
+
 
 logger = get_logger(__name__)
 
@@ -39,6 +40,7 @@ def __init__(self):
         """
         super().__init__(transform_config=CodeProfilerTransformConfiguration(transform_class=CodeProfilerTransform))
 
+
 if __name__ == "__main__":
     # launcher = NOOPRayLauncher()
     print("In code_profiler_transform_python")
diff --git a/transforms/code/code_profiler/image.png b/transforms/code/code_profiler/image.png
new file mode 100644
index 000000000..9a4cbfea2
Binary files /dev/null and b/transforms/code/code_profiler/image.png differ
diff --git a/transforms/code/code_profiler/notebook_example/code-profiler.ipynb b/transforms/code/code_profiler/notebook_example/code-profiler.ipynb
deleted file mode 100644
index 8bf992e61..000000000
--- a/transforms/code/code_profiler/notebook_example/code-profiler.ipynb
+++ /dev/null
@@ -1,255 +0,0 @@
-{
- "cells": [
-  {
-   "attachments": {
-    "a6d2929d-7cdd-42ea-ae09-edfaf87698c0.png": {
-     "image/png": "<base64 PNG data elided: a draw.io diagram of the code data profiler pipeline -- an input parquet file feeding syntactic_concept_extractor, semantic_profiler and higher_order_syntactic_profiler, producing a Metric Report (JSON, html), with table cells for Language/Content, UAST columns, and columns for syntactic and semantic metrics>"
0VZLTQwJTIyJTIwdmFsdWUlM0QlMjIlMjIlMjBzdHlsZSUzRCUyMnNo\nYXBlJTNEdGFibGVSb3clM0Job3Jpem9udGFsJTNEMCUzQnN0YXJ0U2l6ZSUzRDAlM0Jzd2ltbGFu\nZUhlYWQlM0QwJTNCc3dpbWxhbmVCb2R5JTNEMCUzQnN0cm9rZUNvbG9yJTNEaW5oZXJpdCUzQnRv\ncCUzRDAlM0JsZWZ0JTNEMCUzQmJvdHRvbSUzRDAlM0JyaWdodCUzRDAlM0Jjb2xsYXBzaWJsZSUz\nRDAlM0Jkcm9wVGFyZ2V0JTNEMCUzQmZpbGxDb2xvciUzRG5vbmUlM0Jwb2ludHMlM0QlNUIlNUIw\nJTJDMC41JTVEJTJDJTVCMSUyQzAuNSU1RCU1RCUzQnBvcnRDb25zdHJhaW50JTNEZWFzdHdlc3Ql\nM0Jmb250U2l6ZSUzRDE2JTNCJTIyJTIwcGFyZW50JTNEJTIyY3dPR2NOOW9lRW5QY2h3QTBHRVkt\nMzUlMjIlMjB2ZXJ0ZXglM0QlMjIxJTIyJTNFJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIw\nJTIwJTNDbXhHZW9tZXRyeSUyMHklM0QlMjI0MCUyMiUyMHdpZHRoJTNEJTIyMTkwJTIyJTIwaGVp\nZ2h0JTNEJTIyMjAlMjIlMjBhcyUzRCUyMmdlb21ldHJ5JTIyJTIwJTJGJTNFJTBBJTIwJTIwJTIw\nJTIwJTIwJTIwJTIwJTIwJTNDJTJGbXhDZWxsJTNFJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIw\nJTNDbXhDZWxsJTIwaWQlM0QlMjJjd09HY045b2VFblBjaHdBMEdFWS00MSUyMiUyMHZhbHVlJTNE\nJTIyJTIyJTIwc3R5bGUlM0QlMjJzaGFwZSUzRHBhcnRpYWxSZWN0YW5nbGUlM0JodG1sJTNEMSUz\nQndoaXRlU3BhY2UlM0R3cmFwJTNCY29ubmVjdGFibGUlM0QwJTNCc3Ryb2tlQ29sb3IlM0Rpbmhl\ncml0JTNCb3ZlcmZsb3clM0RoaWRkZW4lM0JmaWxsQ29sb3IlM0Rub25lJTNCdG9wJTNEMCUzQmxl\nZnQlM0QwJTNCYm90dG9tJTNEMCUzQnJpZ2h0JTNEMCUzQnBvaW50ZXJFdmVudHMlM0QxJTNCZm9u\ndFNpemUlM0QxNiUzQiUyMiUyMHBhcmVudCUzRCUyMmN3T0djTjlvZUVuUGNod0EwR0VZLTQwJTIy\nJTIwdmVydGV4JTNEJTIyMSUyMiUzRSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUz\nQ214R2VvbWV0cnklMjB3aWR0aCUzRCUyMjIwJTIyJTIwaGVpZ2h0JTNEJTIyMjAlMjIlMjBhcyUz\nRCUyMmdlb21ldHJ5JTIyJTNFJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIw\nJTNDbXhSZWN0YW5nbGUlMjB3aWR0aCUzRCUyMjIwJTIyJTIwaGVpZ2h0JTNEJTIyMjAlMjIlMjBh\ncyUzRCUyMmFsdGVybmF0ZUJvdW5kcyUyMiUyMCUyRiUzRSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUy\nMCUyMCUyMCUyMCUzQyUyRm14R2VvbWV0cnklM0UlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAl\nM0MlMkZteENlbGwlM0UlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlM0NteENlbGwlMjBpZCUz\nRCUyMmN3T0djTjlvZUVuUGNod0EwR0VZLTQyJTIyJTIwdmFsdWUlM0QlMjIlMjIlMjBzdHlsZSUz\nRCUyMnNoYXBlJTNEcGFydGlhbFJlY3RhbmdsZSUzQmh0bWwlM0QxJTNCd2hpdGVTcGFjZSUzRHdy\nYXAlM0Jjb25uZWN0YWJsZSUzRDAlM0JzdHJva2VDb2xvciUzRGluaGVyaXQlM0JvdmVyZmxvdyUz\nRGhpZGRlbiUzQmZpbGxDb2xvciUzRG5vbmUlM0J0b3AlM0QwJTNCbGVmdCUzRDAlM0Jib3R0b20l\nM0QwJTNCcmlnaHQlM0QwJTNCcG9pbnRlckV2ZW50cyUzRDElM0Jmb250U2l6ZSUzRDE2JTNCJTIy\nJTIwcGFyZW50JTNEJTIyY3dPR2NOOW9lRW5QY2h3QTBHRVktNDAlMjIlMjB2ZXJ0ZXglM0QlMjIx\nJTIyJTNFJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTNDbXhHZW9tZXRyeSUyMHgl\nM0QlMjIyMCUyMiUyMHdpZHRoJTNEJTIyNTAlMjIlMjBoZWlnaHQlM0QlMjIyMCUyMiUyMGFzJTNE\nJTIyZ2VvbWV0cnklMjIlM0UlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAl\nM0NteFJlY3RhbmdsZSUyMHdpZHRoJTNEJTIyNTAlMjIlMjBoZWlnaHQlM0QlMjIyMCUyMiUyMGFz\nJTNEJTIyYWx0ZXJuYXRlQm91bmRzJTIyJTIwJTJGJTNFJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIw\nJTIwJTIwJTIwJTNDJTJGbXhHZW9tZXRyeSUzRSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUz\nQyUyRm14Q2VsbCUzRSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUzQ214Q2VsbCUyMGlkJTNE\nJTIyY3dPR2NOOW9lRW5QY2h3QTBHRVktNDMlMjIlMjB2YWx1ZSUzRCUyMiUyMiUyMHN0eWxlJTNE\nJTIyc2hhcGUlM0RwYXJ0aWFsUmVjdGFuZ2xlJTNCaHRtbCUzRDElM0J3aGl0ZVNwYWNlJTNEd3Jh\ncCUzQmNvbm5lY3RhYmxlJTNEMCUzQnN0cm9rZUNvbG9yJTNEaW5oZXJpdCUzQm92ZXJmbG93JTNE\naGlkZGVuJTNCZmlsbENvbG9yJTNEbm9uZSUzQnRvcCUzRDAlM0JsZWZ0JTNEMCUzQmJvdHRvbSUz\nRDAlM0JyaWdodCUzRDAlM0Jwb2ludGVyRXZlbnRzJTNEMSUzQmZvbnRTaXplJTNEMTYlM0IlMjIl\nMjBwYXJlbnQlM0QlMjJjd09HY045b2VFblBjaHdBMEdFWS00MCUyMiUyMHZlcnRleCUzRCUyMjEl\nMjIlM0UlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlM0NteEdlb21ldHJ5JTIweCUz\nRCUyMjcwJTIyJTIwd2lkdGglM0QlMjIxMjAlMjIlMjBoZWlnaHQlM0QlMjIyMCUyMiUyMGFzJTNE\nJTIyZ2VvbWV0cnklMjIlM0UlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMj
AlMjAlMjAl\nM0NteFJlY3RhbmdsZSUyMHdpZHRoJTNEJTIyMTIwJTIyJTIwaGVpZ2h0JTNEJTIyMjAlMjIlMjBh\ncyUzRCUyMmFsdGVybmF0ZUJvdW5kcyUyMiUyMCUyRiUzRSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUy\nMCUyMCUyMCUyMCUzQyUyRm14R2VvbWV0cnklM0UlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAl\nM0MlMkZteENlbGwlM0UlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlM0NteENlbGwlMjBpZCUz\nRCUyMmN3T0djTjlvZUVuUGNod0EwR0VZLTQ0JTIyJTIwdmFsdWUlM0QlMjIlMjIlMjBzdHlsZSUz\nRCUyMnNoYXBlJTNEdGFibGVSb3clM0Job3Jpem9udGFsJTNEMCUzQnN0YXJ0U2l6ZSUzRDAlM0Jz\nd2ltbGFuZUhlYWQlM0QwJTNCc3dpbWxhbmVCb2R5JTNEMCUzQnN0cm9rZUNvbG9yJTNEaW5oZXJp\ndCUzQnRvcCUzRDAlM0JsZWZ0JTNEMCUzQmJvdHRvbSUzRDAlM0JyaWdodCUzRDAlM0Jjb2xsYXBz\naWJsZSUzRDAlM0Jkcm9wVGFyZ2V0JTNEMCUzQmZpbGxDb2xvciUzRG5vbmUlM0Jwb2ludHMlM0Ql\nNUIlNUIwJTJDMC41JTVEJTJDJTVCMSUyQzAuNSU1RCU1RCUzQnBvcnRDb25zdHJhaW50JTNEZWFz\ndHdlc3QlM0Jmb250U2l6ZSUzRDE2JTNCJTIyJTIwcGFyZW50JTNEJTIyY3dPR2NOOW9lRW5QY2h3\nQTBHRVktMzUlMjIlMjB2ZXJ0ZXglM0QlMjIxJTIyJTNFJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIw\nJTIwJTIwJTIwJTNDbXhHZW9tZXRyeSUyMHklM0QlMjI2MCUyMiUyMHdpZHRoJTNEJTIyMTkwJTIy\nJTIwaGVpZ2h0JTNEJTIyMjAlMjIlMjBhcyUzRCUyMmdlb21ldHJ5JTIyJTIwJTJGJTNFJTBBJTIw\nJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTNDJTJGbXhDZWxsJTNFJTBBJTIwJTIwJTIwJTIwJTIwJTIw\nJTIwJTIwJTNDbXhDZWxsJTIwaWQlM0QlMjJjd09HY045b2VFblBjaHdBMEdFWS00NSUyMiUyMHZh\nbHVlJTNEJTIyJTIyJTIwc3R5bGUlM0QlMjJzaGFwZSUzRHBhcnRpYWxSZWN0YW5nbGUlM0JodG1s\nJTNEMSUzQndoaXRlU3BhY2UlM0R3cmFwJTNCY29ubmVjdGFibGUlM0QwJTNCc3Ryb2tlQ29sb3Il\nM0Rpbmhlcml0JTNCb3ZlcmZsb3clM0RoaWRkZW4lM0JmaWxsQ29sb3IlM0Rub25lJTNCdG9wJTNE\nMCUzQmxlZnQlM0QwJTNCYm90dG9tJTNEMCUzQnJpZ2h0JTNEMCUzQnBvaW50ZXJFdmVudHMlM0Qx\nJTNCZm9udFNpemUlM0QxNiUzQiUyMiUyMHBhcmVudCUzRCUyMmN3T0djTjlvZUVuUGNod0EwR0VZ\nLTQ0JTIyJTIwdmVydGV4JTNEJTIyMSUyMiUzRSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUy\nMCUyMCUzQ214R2VvbWV0cnklMjB3aWR0aCUzRCUyMjIwJTIyJTIwaGVpZ2h0JTNEJTIyMjAlMjIl\nMjBhcyUzRCUyMmdlb21ldHJ5JTIyJTNFJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIw\nJTIwJTIwJTNDbXhSZWN0YW5nbGUlMjB3aWR0aCUzRCUyMjIwJTIyJTIwaGVpZ2h0JTNEJTIyMjAl\nMjIlMjBhcyUzRCUyMmFsdGVybmF0ZUJvdW5kcyUyMiUyMCUyRiUzRSUwQSUyMCUyMCUyMCUyMCUy\nMCUyMCUyMCUyMCUyMCUyMCUzQyUyRm14R2VvbWV0cnklM0UlMEElMjAlMjAlMjAlMjAlMjAlMjAl\nMjAlMjAlM0MlMkZteENlbGwlM0UlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlM0NteENlbGwl\nMjBpZCUzRCUyMmN3T0djTjlvZUVuUGNod0EwR0VZLTQ2JTIyJTIwdmFsdWUlM0QlMjIlMjIlMjBz\ndHlsZSUzRCUyMnNoYXBlJTNEcGFydGlhbFJlY3RhbmdsZSUzQmh0bWwlM0QxJTNCd2hpdGVTcGFj\nZSUzRHdyYXAlM0Jjb25uZWN0YWJsZSUzRDAlM0JzdHJva2VDb2xvciUzRGluaGVyaXQlM0JvdmVy\nZmxvdyUzRGhpZGRlbiUzQmZpbGxDb2xvciUzRG5vbmUlM0J0b3AlM0QwJTNCbGVmdCUzRDAlM0Ji\nb3R0b20lM0QwJTNCcmlnaHQlM0QwJTNCcG9pbnRlckV2ZW50cyUzRDElM0Jmb250U2l6ZSUzRDE2\nJTNCJTIyJTIwcGFyZW50JTNEJTIyY3dPR2NOOW9lRW5QY2h3QTBHRVktNDQlMjIlMjB2ZXJ0ZXgl\nM0QlMjIxJTIyJTNFJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTNDbXhHZW9tZXRy\neSUyMHglM0QlMjIyMCUyMiUyMHdpZHRoJTNEJTIyNTAlMjIlMjBoZWlnaHQlM0QlMjIyMCUyMiUy\nMGFzJTNEJTIyZ2VvbWV0cnklMjIlM0UlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAl\nMjAlMjAlM0NteFJlY3RhbmdsZSUyMHdpZHRoJTNEJTIyNTAlMjIlMjBoZWlnaHQlM0QlMjIyMCUy\nMiUyMGFzJTNEJTIyYWx0ZXJuYXRlQm91bmRzJTIyJTIwJTJGJTNFJTBBJTIwJTIwJTIwJTIwJTIw\nJTIwJTIwJTIwJTIwJTIwJTNDJTJGbXhHZW9tZXRyeSUzRSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUy\nMCUyMCUzQyUyRm14Q2VsbCUzRSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUzQ214Q2VsbCUy\nMGlkJTNEJTIyY3dPR2NOOW9lRW5QY2h3QTBHRVktNDclMjIlMjB2YWx1ZSUzRCUyMiUyMiUyMHN0\neWxlJTNEJTIyc2hhcGUlM0RwYXJ0aWFsUmVjdGFuZ2xlJTNCaHRtbCUzRDElM0J3aGl0ZVNwYWNl\nJTNEd3JhcCUzQmNvbm5lY3RhYmxlJTNEMCUzQnN0cm9rZUNvbG9yJTNEaW5oZXJpdCUzQm92ZXJm\nbG93JTNEaGlkZGVuJTNCZmlsbENvbG9yJTNEbm9uZSUzQnRvcCUzRDAlM0JsZWZ0JTNEMCUzQmJv\ndHRvbSUzRDAlM0JyaWdodCUzRDAlM0Jwb
2ludGVyRXZlbnRzJTNEMSUzQmZvbnRTaXplJTNEMTYl\nM0IlMjIlMjBwYXJlbnQlM0QlMjJjd09HY045b2VFblBjaHdBMEdFWS00NCUyMiUyMHZlcnRleCUz\nRCUyMjElMjIlM0UlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlM0NteEdlb21ldHJ5\nJTIweCUzRCUyMjcwJTIyJTIwd2lkdGglM0QlMjIxMjAlMjIlMjBoZWlnaHQlM0QlMjIyMCUyMiUy\nMGFzJTNEJTIyZ2VvbWV0cnklMjIlM0UlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAl\nMjAlMjAlM0NteFJlY3RhbmdsZSUyMHdpZHRoJTNEJTIyMTIwJTIyJTIwaGVpZ2h0JTNEJTIyMjAl\nMjIlMjBhcyUzRCUyMmFsdGVybmF0ZUJvdW5kcyUyMiUyMCUyRiUzRSUwQSUyMCUyMCUyMCUyMCUy\nMCUyMCUyMCUyMCUyMCUyMCUzQyUyRm14R2VvbWV0cnklM0UlMEElMjAlMjAlMjAlMjAlMjAlMjAl\nMjAlMjAlM0MlMkZteENlbGwlM0UlMEElMjAlMjAlMjAlMjAlMjAlMjAlM0MlMkZyb290JTNFJTBB\nJTIwJTIwJTIwJTIwJTNDJTJGbXhHcmFwaE1vZGVsJTNFJTBBJTIwJTIwJTNDJTJGZGlhZ3JhbSUz\nRSUwQSUzQyUyRm14ZmlsZSUzRSUwQan/rH8AACAASURBVHhe7Z0HuBQ1+7cjYseKFSv2gtiVoq+K\nFcXeG/aGYseGvXcRUcH+2rFgVxBFAcXeUREbdkVUUOyF77rz/3LeYZnZnexmzs7u/nJdXMA5mUxy\nJ5Pkl+dJMt2UKVOmGAUREAEREAEREAEREAERiBCYbrrpxEMEGpLAdBJIDVnvKrQIiIAIiIAIiIAI\nFCWAQAq5jh46vWpVX72Uo1r88v5e6lcCKe+1pPyJgAiIgAiIgAiIQBUIhBYCodOrAhL7ynopR7X4\n5f29Ekh5ryHlTwREQAREQAREQASqRCC0EAidXpWwSCBVC3wzvVcCqZlA6zUiIAIiIAIiIAIiUGsE\nQgua0OlVi2e9lKNa/PL+XgmkvNeQ8icCIiACIiACIiACVSIQWgiETq9KWGRBqhb4ZnqvBFIzgdZr\nREAEREAEREAERKDWCIQWNKHTqxbPeilHtfjl/b0SSHmvIeVPBERABERABERABKpEILQQCJ1eGiyf\nf/65mX/++c1MM82UJnqqONUoR6qMVRjp559/NrPMMotp2bJlhSnV9uMSSLVdf8q9CIiACIiACIiA\nCGRGILQQSJPexIkTzdxzz22ef/5506FDh6ay9enTx1x33XXmnXfeafrZ+PHjzQILLGC22mor89BD\nD03F4aqrrjJXX321effdd+3Pt9lmG3PBBReY2Wef3SyyyCKJzNIca56mHKUq5bzzzjPvvfeeufXW\nW0tFbZbfH3zwwebaa681b7/9tmnXrl2zvDPpJX/88Ye5/fbbza677mpmnXVW77xEn+fh2WabzXzw\nwQdm6aWXTpWWBFIqTIokAiIgAiIgAiIgAo1HIIQQiFJLk16SQLr88svN9ddfP5VAQgCdeOKJBssH\nYmm++eazr3v22WfNeuutZ5566inzn//8x076Tz31VBsH4fXtt9/aeG+++abZYost7N/zzjuv/Vmb\nNm1KVnSacpRK5Nxzz7XiDSFQ7fDXX3+ZGWec0Tz33HOmU6dO1c6O+eGHH0zr1q3Nl19+mao+CjMc\nfX7BBRe07WHNNddMLbYkkKreBJQBERABERABERABEcgngRBCIEuBtMYaa5gDDjjAnHHGGebss882\nBx10kH3dzTffbPbdd1870cYaRRgzZoy5++67zWmnndaUpVdffdVOnKPiKk1NxHG59957Te/evQ0C\nb5dddjEXXXSRmXnmma1l66STTjK4+nXp0sX069fPWrCcQMJqs+GGG5oHHnjAioHJkyfbeI8++qj5\n5JNPzDnnnGMoJ+JwrbXWMkcddZQ5+eSTrUXkiCOOsO986aWXzKWXXmpWWmkla2XDpRALGha4H3/8\n0fK56667rAjEiobFLRq23357c//995vVV1/dDBw40CCYevToYeCz/PLLG8Rp586dzeuvv27zvc46\n65jbbrvNCstoII2+ffva57bddltbVvLz8ccf2/wQ3nrrLVtnDz/8sM0HlqvLLrvM/P7775YTv4PH\nM888Y9q3b2+efPJJyw6L28iRIy2DE044wQpgQhz3rl27Nj3/2GOPmd12283ccccdlntSPRW2U10U\nm+ZLUBwREAEREAEREAERaDACeRZIuKetuOKKVtxccskl1vqBpYCAeFh22WXtnz322MNakTp27DjN\nPqRQAmn06NFm5ZVXtuJgmWWWMUceeaS1WCE4EC2IGtzFEHEIKCb6TPixIGEFm2uuuayIaNu2rf09\nog7rCb/fZJNNzA477GBF1+GHH27Li7shgXQRD4g/4m233XZW2CCIWrRoYUaMGGHZIDIQNoMHDzan\nnHKKFSPRPVmjRo2yAggRtemmm1phQnkQYkOGDDFXXHGF+fDDD837779vWcK1V69eVsy4MGHCBGvB\nQ4SS/2OPPdbsvffe1l1vs802Mz/99JN1bzzrrLPME088Yf/g+kZaV155pX3uhhtuML/99pvNN88M\nGjTIbLnlljYvG2ywgTnwwANtHh9//HEDc9wt47gjEN3ziC3yM3bsWIPrXVz8Pffcc6ovu2ILEgko\niAAE0vjsipQIiIAIiIAIiEDtEMizQMIS9MYbb1gLzQsvvGAFEBaXJZZYwgJm8nzjjTfaifcXX3xh\nJ+dYNLp3795UAaEEEpYPBBoTewKC5LPPPrOi4umnn7ZufQSEzAorrGA+/fRTu/corUD65ZdfrHvY\n/vvvb0UKQvDff/81008/vRk+fLj5888/rUDC1bBVq1ZWCO28885WlBx22GGWz3//+18r1sgnwi26\ntwfhgLXro48+skICC4wTNOR7jjnmMAMGDLAWGARS3H4eyrv44otbsbPPPvtY6x1iZ8kll7Tucrwf\nAUceyBNxEEjDhg2zFiPyznsQvgicqIsdoghXSNrjgw8+aPbaay+b9plnnhnLHeuVex7xyXsoF+0h\nrp7222+/8AKpkSbGoTuK5u4is8p/Vuk2Nx+9TwREQAREQARE4H8EQo/vadL7+++/zQwzzGBFxkYb\nbdSUGSwu7Cnizz///GPd0ZgkY7VAICCCLr74YnPcccdZNzVOYmPS78QSlgvEEvuPmIATQgkkrDuk\niTiIBn6+8MILWxcyAvlFmLzyyitWxMQJJGeJcRYkLGBuz1TPnj1tuXB5c8IF1zzEUjSeswgxR4cL\n1iuEAWLl6KOPtpaoaIgKpKFDh1pLWPQwjHXXXdfsuOOO1tUPiw7iKS7g5gd/AvHIJxY1rFqUHesV\n
ByVQNidcomKL9oHoXXTRRacSSFjZEENYzygD1jbSw0IVxz26BykqkHh/XPzCsgSxIEkg1U5XmqZj\nKqc0WaVbTl70jAiIgAiIgAiIQBgCocf3tOmtssoq1tKDm5YLCAAmt0y62Z+C1QH3L3ckNXtMsNQw\nscfCwKEDuGy5MG7cOOvChjhhoh9SICG+XnvtNbuPiPDyyy9bYUJemPCz74XAYRG4r+HihutbVCA5\nocA+Hyw8TiCxlwo3OgICCfHoBBcWFyeQovGiAglXNMr9/fffW+sL+5Z4L5YsF6ICiXx369bN7l3C\nQsU8f84557SCDmGaJJBIn2cQN7z/9NNPt//mEAqsRFh1+NkjjzxirWq//vqrtew410LyEieQEEKI\nKixQWMWwBNE++Dn7vOK4r7/++rEWpDvvvDM2PpataJBA8uw/0n7Ynsk2W/Ss8p9Vus0GRi8SAREQ\nAREQARGYhkDo8T1teggBJv5ssGdyzJ4T9uEwwcUawt4X9uo44UHGmSgjfDgEgEMLiIO1ib0rHDpw\n4YUX2gl61HUslAUJ0cXBCuSTQw3Yx0MZEEO8n3JghcEVD9GEW6A7pIHDDhAg7PfhGfYvIexCCSSE\nCflgTxRubLiewWq11VaLFUjOeoM1Dpc+LEpYwrDKIECTBBJpUlYEIVYeWMMXQYRVcJ555rHv51AK\n9hIVE0i46rFvCHGHeyGHQsCDvMOIgx/4OUIvjjtlds8jDp2L3aRJk2Lj4+4ngVRBB5j2w67gFZk+\nmlX+s0o3UxhKXAREQAREQAREoCiB0ON72vSYPGMt4CQ3FzjO+/zzz2+aWN9zzz3W7csFLB2LLbaY\nndQjBrCU4JrlAlYZ3LSwjrgQSiDh4sZ7OT2PwOlsuPNxEMLuu+9uDxZgDxSXsHJ629prr20PaXD3\nIGGBQlAQsH6xPymtQMIqhGUnyYKEtQYRwfsJcOVEvGjARZG8OmsOJ87hhscziBoEG/t02PvEXqAk\nFzvSpl6w9OHadtNNNzUdG0594IKICyFCJ0kgcTIegg5BiVsgFi1OunNtgYMpsBZuvPHGllMcd1wr\n3fPsA0NkY6FDuCXFl0CqoDNM+2FX8IpMH80q/1mlmykMJS4CIiACIiACIpBLgeQyxeQc9zImuLjM\n+QZcvnCt4+AGd9y3bxpx8ZPmPbiZ4ZaGOIiGr776ygqNpZZaqsklsDBdrBsErEmhA4IGDrwfa0qa\ngIUOcYKo8LmslQMoEFzsPYoGBC4cbrnlljSvt3GoP1dvtAOsUOQfF0X+OM5J3KPPR1+aFN/FkYtd\n6ir6v4i1LgSyyn9W6XpWj6KLgAiIgAiIgAgEJBB6fA+dXsCieiVVL+XwKnSZkRFa7BXC+lfo2ldm\nkpk/JoHkiTjpg8BMiGLmJBV3E7Nn0s0SPasPOqt0mwWKXiICIiACIiACIhBLIPT4Hjq9alVbvZSj\nOfhhveJgDdwOcXOshRBcIC2wwAL2Jt/CC5dqAUaaPBZ+EJjuDjnkEOtn6gIXU3HOOsdPlhM4SYQT\nP9iE6GPSjL4LscYGxZ122mmqLGT1QWeVbjn89IwIiIAIiIAIiEAYAqHH99DphSmlfyr1Ug7/kjfG\nE5kIJI4tZINZPYboB8FmQI4RZGMe/pT4aHJ5F5u/2CznLgXz5RA9u71ckcUGQDaxceFXNGT1QWeV\nri87xRcBERABERABEQhHIPT4Hjq9cCX1S6leyuFX6saJ3awCiVM2sC4hIjj+8IILLrCXZ3EzLyd1\ncPvxE088YY9o5Kxz3NU4CpFTL7755htrleJseNJgoxbPc0wigVMuMN+54x7xc+TYRI4V5DhCjipk\n01xSehxpiMWFy63IH1Yg0uOUjSSBwc3FHGfIrcjLLbdcUzTeQZnIA6do8Df/Z8MaZ/hzrCPguYmY\nUzk4y56NZhz9yJGUnOvP+f6c4MElaYgtjnzkZ5z7zjGVHM1IfmHBjcQcd8iJIZw6ghWPkz04iYQj\nMaNHYGb1QWeVbuN8iiqpCIiACIiACOSPQOjxPXR61SJWL+WoFr+8v7fZBBLiABFy2mmn2bPMOeud\niT0ihGMNOd6QoxdXXHFFc9hhh9k/xEUQ4K+49dZb2w1eL7zwgj0SETF10EEHNV2cxdF/CCksJlzC\nhUDgnHVOvsDNrH///oZLoJLSI96CCy5oz1bnyEjehc/kiBEjEgUSaZJ3d7txXGUjXLjkDJG08sor\nW4HEWeuIIU7hWHbZZe2Rh7jocYQil17xTgTaoEGD7FGK6623nj2l4/jjj7fn+SMMce3jzP1NNtnE\nlosbis844wzTokULe5HXFVdcYfr06WPP44/6e2b1QWeVbt4/IOVPBERABERABOqZQNz4zuJzv379\n7NwM75mBAwfabQXMV3Dxv//+++28JO7UuXqZL9RLOeq57VZStmYTSLiNYQHZfvvtzXfffWcFEdYg\nzlNHZHBGuxMjiCc+MCwlCCbOS+cjw2KCuColkPg9AoK4HE+IQMKKhOtbUnqICqwxn332mbXuYEXi\noq+vv/7aCicXoh8Eoofz7ombFDjnHrFz9tln2yicO48Q5BkEEjcLYzHi6EduQybvWIKwXGFV4/1r\nrrmmFYQcUYlbH/nhXH+OgUQg8WyrVq2sMOL8eYSdXOwq+Sz0rAiIgAiIgAiIAATihACLueyR5vLT\nrl272vtm+MMCLgu5LFJzclnccdX1IizqpRxq5fEEmk0gcfDAOeecYy0bTOgXWWQRwy25TiAhTLhM\nioAwevHFF61lBCsIl3gRWLGYYYYZYgUS7nvHHHOMtSDhfte7d297eZS7FAsrDqscSenhmsZqSGHA\nGtWuXbtYgcSlX7vttpvhLHXOZXcBgYL1igu1EDXkY5tttrG/dh0Ht/8ikLi0irP9XSfE7cPuBmME\nEq6DiJ7CcM0119jnsEg5C9aoUaNM586drYiSQNInLwIiIAIiIAIiUCmB0EIgdHqVlq/c5+ulHOWW\nv96fazaBhLkV6xFuYauuuqrdk8MtuE4gIQbcrb5OIO2yyy7mwAMPtNYg9g9hTcGE6yxIuKo5cYCb\nGuIHgXT00UdbVzwEDCIMgYE7HxahpPQ4NW7IkCHWykX466+/zOjRo81aa601lYk4+kFw0y9lwRUO\nMecCBzZgJUL8bLTRRtZlD5dBAsIGMcczCCR3Y3GSQCIPWKDYg+XE3rvvvmvLxfuTbk2WQKr3T1fl\nEwEREAEREIHsCYQWAqHTy55A/BtCl4MDv/jTsmXLxCJhAMDLaKaZZqpWsRvmvZkIpMMPP7zJYgJJ\nhACigD02TOpxt8Mki9scJ73hYhcnkNgHxC28iCnEEvt2EBgIJPY0sZ+IgxJwm+M0OQ46QCB169bN\ntG3b1u7t4UIqDlLgRLfu3bsnpocFCyFCeogiDnXAqkS+oo218INAfGHhuvPOO+2hCi+//LJ158Ol\njoMVsJghvhBrWM0wOyMU2T+VJJAQP+yJQhxx4AKHVZAf9
lgNHTrUlo/fIRiTBBJ7srBglco/9cM+\nqVNOOcUejLH55pvbxo8VCwsVro9JcaJfSeiOomG+QBVUBERABERABHJMIPT4Xkl6zpOo8HCsauCr\npBxx+WVfOweVsYBeGJiPsrWCBXICXkm4M7LwX60QvU6GA8WiHk3Nkaek62x4d4j8ZCKQxo8fPxUb\n9smwV4fNe+w/ImBRoXLdCXaFAgmhwelrCAv2+rz11lvWEsRhBh9++KG1oHCAAYIBy0qXLl3s4QwI\nJBoYVhsC4oJ4uM/hvoa4iktvqaWWsocpcKIcAYWO6CHdYkIACxZWKSw2LiBkKNsss8xiLT8IL/JP\nQHw98sgjds9QnEBCQCK08OXFvQ4uI0eOtHciuXDeeedZsUgDSBJIX331lT3pjsMdonuk4j5oRBwC\nEhHLhksC8cgH+SHExSnGpTk+Dr1DBERABERABEQgWwKhhUAl6UW3WlRTHLh5ElsaQoUkgYSnFfNn\ntmhw+jFz3VNPPdUw12ZvfrVC1FMJTy/mmh06dGi27CR5SpGBEPkJLpCKkaEhIWDYl4NVZtKkSdaK\nhJCICxSQQwy23XZbe9Q1YmSxxRaz7m/OqoMAYf9P4Ukp7HlCJPAuCokw428sRMXSIx7vwXrCEd2F\nIenDZh8SecFyVXi5K3nnkAXyiGWGNNIEyo8licCeJdLgWHT3s1Jp0JGwkdK55mXxQbs8VNLhlSqH\nfi8CIiACIiACIlAdAqHH92h6zFPYn4530LzzzmsOPfRQgxcS86a4K1LYh+72ouONlHTdC4vvY8eO\ntXM/PGFYYO/YsaPdn05gMRxvn2LXzLA1pG/fvtZLiHkjC+0cphU378FbicVrFrRZCD/hhBOsqEm6\njgUhgTsd3kZsN2HRn/kd8+JCCxIGBhbDKa+b/2FBw2DAac8E4nAq9OTJk+2VOAgo2CKosOzceOON\ndi5IfPb7M7fGAMDPmYP75r/wOpkTTzzRlp+8J9VptPXChXrnWh+218CMhXoOaWN7CoYGV1dxV/Cw\n1z96nQ1xuUKHw9m44gePtVL5YY7N6c8YYmh7tKWtttpqqvqdbkoFEjj0hxMFCGT2DnESHIcPUJEc\nlU3Bywkh0suyvOWUyfeZrPKfVbq+5VN8ERABERABERCBcARCj+/R9BAovXr1sh47TFj32msvuzcb\nD5m4K1LYkpF0WFf0uhd3fQyCgO0NbGvAMwhvGDyP2G/Ogn3SNTMcUY4HEiIEUUJe9t57b3sYWJxA\n4i5MtnPgUcSEmytW2AqB1SfuOhZObSYe+WLPOttGEBnsZy8USAgGroThD2IH0YPYc/uQeBeeUoi5\nFVZYwYop8sG9nngqsQ0FIYCLHuXmd4hDRB95YAuLb/4Lr5NBvDkXu6Q6xYDgAvULF+7q5P2IYixi\nnCVAQCyx34q6jruCp/D9GBHgAiPaE8aRUvm57777bDtDWJEeW03YquO4NqsFKdznWr2UQncUzV2S\nrPKfVbrNzUfvEwEREAEREAER+B+B0ON7ND0WwNlX7SwhuJkxoWdyH3dFCp40aQUS4gT3NAIWD0QP\nggiLENenIEqwYsRdM4P1CqsOe9k5EAwBgEcOQiJOICE0ECmUje0KCD3i8/6k61jYF48I44oZAvvT\nCXF7kN555x1rJECwcX0N1iAsWqTB1gjEnzvojHgcdsZVOggkRAAHhnGiMvGxvvA8z8EfYVBO/qMu\nbtE9P0l1ioApFEgIG7yuuIYHFz14YVnjYDY8vrBsJV3Bw1YchBRCF8sd6buTodPkh7u7ONCN/fcY\nY9jWwr2hzgssiEBSRyICEKjAECmAIiACIiACIiACOSSQpUDCZQ2vIO6rjAZ+HndFClss0lz3ghBi\nEz+uZwSEDS5UWGjYw8MeawQMQijumhlEChNzhBoBiwsubBwcFieQsM5gCcIKwpUyWMGcQEq6joUy\nImbcnvlLLrnEcF1LoUDCbY5tJW7bB2KJu0QRS2wJwZUPd8JoQACx7SN6ncw999xjLUk8T3AnPMOq\nnPwnCaSkOo3mr/AQhZ49e9oywphAGg888IC9cDjpCh7EclQgUUeIP0I0/aT8IDTZ348wos44ARtL\nVrR+c+til8N+IvbCtDzmMylPoTu6uI6ilngoryIgAiIgAiIgAskEQs8boumx6o+bFaf+EhAH7DXH\nTSruihQOuYoKpKTrXgpPR0YgYVXBraxQIMWdoszpvrj8sW8c0UJ6/JtTiQvnPVgw2LeOJQLRgVjh\nlGUnkJIO0+IwLoShKzvudhMmTJhGIGGNYg97dHvJuHHj7J53rs5hUs9Jzs4KR74RR7jbRQ8DSxJI\n7FkqJ/9JAimpTrHauVB4yBgCiXp1B6U5gYR1L+kKHn6eRiAl5QfBCUPOEMDqx74nTgmEGyGIBamR\nLAehO4rm7pSzyn9W6TY3H71PBERABERABETgfwRCj+/R9LAYsKme/SC4W7Vr184KjMceeyz2ihTE\ngBNIxa57qVQg4XbGniJOP8a64FzzEGiFAolDBDgcAKHVunVrw2QfgUV5EFdJAsmVHVcvDhbj0AT2\n2xdakBBGBxxwgN3PRJ6woiH2yBMWE67EwZoEMyxHCC0OXuC5NAKJa3fKyX/0Ohn2NjlLWVKdRq1v\naQUSZwckXcGDQHLX2XBlUJIFKSk/iHAsiRxo8fPPP9u6w6WPE6AlkMroAUN3FGVkoaJHssp/VulW\nVFg9LAIiIAIiIAIiUBGB0ON7ND1c0pigMzFlco81hZPdkq5I4dRiBBKnuHE9S9J1L3ECCSHB/UHO\ngsSGfE46S7pmBmsQVhf293BlCqe/derUaRqBhJGA088QDASsGpxMxylriKMkgcRdlsRxdxsxwceq\nwRU30cCJflg3cINzgb0yuPSxf2vixInWCsfJdAROhCMNTmYrJZAQpAiEcvIfvU6Ge5qcQEqq02iZ\n0ggkrDobbrhh4hU80fcjBmkLcS52SfnBOoUodSc9U99uH5cEUhldRuiOoowsVPRIVvnPKt2KCquH\nRUAEREAEREAEKiIQenyPS4/9IAiR6JUtaa9ISbrupaJC//+H2ceEpSpq/XDpFpaDU9cQcIgSxBd/\nEFbFAhYSXPTatGkz1ZUscc/gOodrHSe0FV73gkgjr1iiOMmNvPmGcvNfeJ2Me29cnfrmycVPuoIn\n7jqbpHfE5QdBBVPENvUWDXKx86yt0B2F5+srjp5V/rNKt+ICKwEREAEREAEREIGyCYQe30OnV3bB\nKnywXspRIYa6fVwCybNqa/2DyCr/WaXrWT2KLgIiIAIiIAIiEJBA6PE9dHoBi+qVVL2Uw6vQDRRZ\nAsmzsmv9g8gq/1ml61k9ii4CIiACIiACIhCQQOjxPXR6AYvqlVS9lMOr0A0UWQLJs7Jr/YPIKv9Z\npetZPYouAiIgAiIgAiIQkEA5+1kCvl5JiUDVCOgeJA/0tS4Essp/Vul6VI2iioAIiIAIiIAIBCYQ\nenwPnV7g4qZO
rl7KkbrADRZRFiTPCq/1DyKr/GeVrmf1KLoIiIAIiIAIiEBAAqHH99DpBSyqV1L1\nUg6vQjdQZAkkz8qu9Q8iq/xnla5n9Si6CIiACIiACIhAQAKhx/fQ6QUsqldS9VIOr0I3UGQJJM/K\nrvUPIqv8Z5WuZ/UougiIgAiIgAiIQEACocf30OkFLKpXUvVSDq9CN1BkCSTPyq71DyKr/GeVrmf1\nKLoIiIAIiIAIiEBAAqHH99DpBSyqV1L1Ug6vQjdQZAkkz8qu9Q8iq/xnla5n9Si6CIiACIiACIhA\nQAKhx/fQ6QUsqldS9VIOr0I3UGQJJM/KrvUPIqv8Z5WuZ/UougiIgAiIgAiIQEACocf30OkFLKpX\nUvVSDq9CN1BkCSTPyq71DyKr/GeVrmf1KLoIiIAIiIAIiEBAAqHH99DpBSyqV1L1Ug6vQjdQZAkk\nz8qu9Q8iq/xnla5n9Si6CIiACIiACIhAQAKhx/fQ6QUsqldS9VIOr0I3UGQJJM/KrvUPIqv8Z5Wu\nZ/UougiIgAiIgAiIQEACocf30OkFLKpXUvVSDq9CN1BkCSTPyq71DyKr/GeVrmf1KLoIiIAIiIAI\niEBAAqHH99DpBSyqV1L1Ug6vQjdQ5CACqYF4qahFCEyZMkV8REAEREAEREAE6ohAaCEQOr1qoa6X\nclSLX97fG0QgNdLEuNY/iKzyn1W6ef+AlD8REAEREAERqGcCocf30OlVi329lKNa/PL+Xgkkzxqq\n9Q8iq/xnla5n9Si6CIiACIiACIhAQAKhx/fQ6QUsqldS9VIOr0I3UGQJJM/KrvUPIqv8Z5WuZ/Uo\nugiIgAiIgAiIQEACocf30OkFLKpXUvVSDq9CN1BkCSTPyq71DyKr/GeVrmf1KLoIiIAIiIAIiEBA\nAqHH99DpBSyqV1L1Ug6vQjdQZAkkz8qu9Q8iq/xnla5n9Si6CIiACIiACIhAQAKhx/fQ6QUsqldS\n9VIOr0I3UGQJJM/KrvUPIqv8Z5WuZ/UougiIgAiIgAiIQEACocf30OkFLKpXUvVSDq9CN1BkCSTP\nyq71DyKr/GeVrmf1KLoIiIAIiIAIiEBAAqHH99DpBSyqV1L1Ug6vQjdQZAkkz8qu9Q8iq/xnla5n\n9Si6CIiACIiACIhAQAKM7woi0IgEpptSwUVGjTYxrvXyZpX/rNJtxA9SZRYBERABERCBvBAIPb6H\nTq9anOqlHNXil/f3yoLkWUO1/kFklf+s0vWsHkUXAREQAREQAREISCD0+B46vYBF9UqqXsrhVegG\niiyB5FnZtf5BZJX/rNL1rB5FhAbShQAAIABJREFUFwEREAEREAERCEgg9PgeOr2ARfVKql7K4VXo\nBoosgeRZ2bX+QWSV/6zS9aweRRcBERABERABEQhIIPT4Hjq9gEX1SqpeyuFV6AaKLIHkWdm1/kFk\nlf+s0vWsHkUXAREQAREQAREISCD0+B46vYBF9UqqXsrhVegGiiyB5FnZtf5BZJX/rNL1rB5FFwER\nEAEREAERCEgg9PgeOr2ARfVKql7K4VXoBoosgeRZ2bX+QWSV/6zS9aweRRcBERABERABEQhIIPT4\nHjq9gEX1SqpeyuFV6AaKLIHkWdm1/kFklf+s0vWsHkUXAREQAREQAREISCD0+B46vYBF9UqqXsrh\nVegGiiyB5FnZtf5BZJX/rNL1rB5FFwEREAEREAERCEgg9PgeOr2ARfVKql7K4VXoBoosgeRZ2bX+\nQWSV/6zS9aweRRcBERABERABEQhIIPT4Hjq9gEX1SqpeyuFV6AaKLIHkWdm1/kFklf+s0vWsHkUX\nAREQAREQAREISCD0+B46vYBF9UqqXsrhVegGihxEIDUQLxW1CIEpU6aIjwiIgAiIgAiIQB0RCC0E\nQqdXLdT1Uo5q8cv7eysWSHkvoPInAiIgAiIgAiIgAiJQHoHQQiB0euWVqvKn6qUclZOozxQkkOqz\nXlUqERABERABERABEaiYQGghEDq9igtYZgL1Uo4yi1/3j0kg1X0Vq4AiIAIiIAIiIAIiUB6B0EIg\ndHrllaryp+qlHJWTqM8UJJDqs15VKhEQAREQAREQARGomEBoIRA6vYoLWGYC9VKOMotf949JINV9\nFauAIiACIiACIiACIlAegdBCIHR65ZWq8qfqpRyVk6jPFCSQ6rNeVSoREAEREAEREAERqJhAaCEQ\nOr2KC1hmAvVSjjKLX/ePSSDVfRWrgCIgAiIgAiIgAiJQHoHQQiB0euWVqvKn6qUclZOozxQkkOqz\nXlUqERABERABERABEaiYABNFBRFoRALTTdENn7mqd3VGuaoOZUYEREAEREAERCAlAU0pU4JStNwT\nkEDKWRXJbJuzClF2REAEREAEREAEShLQ/KUkIkWoIQISSDmrLHUwOasQZUcEREAEREAERKAkAc1f\nSiJShBoiIIGUs8pSB5OzClF2REAEREAEREAEShLQ/KUkIkWoIQISSDmrLHUwOasQZUcEREAEREAE\nRKAkAc1fSiJShBoiIIGUs8pSB5OzClF2REAEREAEREAEShLQ/KUkIkWoIQISSDmrLHUwOasQZUcE\nREAEREAERKAkAc1fSiJShBoiIIGUs8pSB5OzClF2REAEREAEREAEShLQ/KUkIkWoIQISSDmrLHUw\nOasQZUcEREAEREAERKAkAc1fSiJShBoiIIGUs8pSB5OzClF2REAEREAEREAEShLQ/KUkIkWoIQIS\nSDmrLHUwOasQZUcEREAEREAERKAkAc1fSiJShBoiEEQgffrpp2aJJZYwY8aMMcstt1zQ4v/xxx/m\n9ttvN7vuuquZddZZg6adx8TUweSxVpQnERABERABERCBYgQ0f1H7qCcCQQTSuHHjTNu2bTMRSD/8\n8INp3bq1+fLLL02bNm3qiX1sWdTB1H0Vq4AiIAIiIAIiUHcENH+puypt6AIFF0iTJk0yl156qVlp\npZXMddddZ+aff35z1VVXmQ4dOtif//zzz+aNN94wzz//vOnatavp27evmX766c2GG25oHnjgASuC\nJk+ebLp06WIeffRRs/POO5tnnnnGtG/f3jz55JNmvvnmq+sKUwdT19WrwomACIiACIhAXRLQ/KUu\nq7VhCxVcIH3++edmk002Mdttt53p0aOHOeOMM0yLFi3MiBEjzKGHHmr69+9v+vXrZ13xDj74YHPE\nEUeYffbZx8w111zm448/tpaoiRMnmrnnnttajUaPHm0222wzM2jQILPllluaGWecMXeVdeaZZ5r1\n11/fbLDBBhXnTR1MxQiVgAiIgAiIgAiIQDMTyGL+wgL58OHDzemnn97MpdHrGp1AZgIJS1GrVq3M\n4MGDrRXop59+sgLpk08+sT8j3HXXXWbAgAHWcpQkkGaeeebcu9ghkBCC/Kn0I/btYIivIALVJDBl\nypRqvr7ku/WNlERU8xHy2AbV7mq+WTVMAUJ9P77zlzSAEUh4GLEA/fTTT
6d5RHFEIAiBTATSHnvs\nYb799lubwVGjRpnOnTsbPkAEEgct4GpHeOGFF0zHjh3N999/b0WQsyBNmDDButJhQaoVgcRHTOAj\nrkQk+XYwvvGDtBolIgL/n0AttL9ayKMaVPkE8lq/ec1X+aT1ZD0SCNlOQ6blWDO3YhGauRX/Zn4V\nwlunHutSZQpLIBOBtO+++xpc7eIE0l9//WWuv/56+zv+7t27txk7dqy1IH3wwQdm6aWXNq+//rpZ\nffXVa0ogUR4+XFY6COWudPh2ML7xwzYfpdboBGqh/dVCHhu9HVVS/rzWb17zVQlrPVt/BEK205Bp\nFQok5lTOW4d/SyTVX1vMW4maXSBxZPfbb79tZphhBut6t+SSS5r//ve/Zs455zQnn3yy6dmzpzny\nyCPNDTfcYAUSFif2I7EXiYMf8hj4aJ1A4m/+X+5Kh28H4xs/j/yUp9olUAvtrxbyWLstoPo5z2v9\n5jVf1a8x5SBPBEK205BpxQkkfuZc7kJsachTPSgv+SMQRCBF70HCclTMgjRs2DBrMSJgJXr44Yft\nyXVnnXVWk2vaXnvtZW699damo73XXXdd89xzz+X2qO9CgVTJR+zbwfjGz18TVI5qmUAttL9ayGMt\nt4Fq5z2v9ZvXfFW7vvT+fBEI2U5DppUkkNz8yrndVbKlIV81odzkjUAQgZS2UOxBwpXutNNOM999\n951ZbLHFpnqUI8IJWJMKw48//mgtSXkMcQKp3I/Yt4PxjZ9HfspT7RKohfZXC3ms3RZQ/ZzntX7z\nmq/q15hykCcCIdtpyLQco6T5Fb+vdEtDnupBeckfgaoIpPPPPz9/JCrIUbEP2Pcj9u1gfONXUEw9\nKgLTEKiF9lcLeVTTKp9AXus3r/kqn7SerEcCIdtpyLTSCCTiVLKloR7rU2UKR6BZBdLjjz9uT6Vz\nqj9cMaqbUimB5PMR+3YwvvGrS0pvrzcCtdD+aiGP9dYumrM8ea3fvOarOetG78o/gZDtNGRaaQWS\nm1+xJ0mHN+S/vdVSDptVINUSGJ+8phFIaT9i3w7GN75PuRRXBEoRqIX2Vwt5LMVZv08mkNf6zWu+\n1JZEIEogZDsNmZaPQCKuDm9Quw5NQAIpANG0AinNR+zbwfjGD1BcJSECTQRqof3VQh5ruUn98ccf\n5p9//rEnjlYj5LV+o/n6+++/TYsWLeyfckKlz5fzzrTPkLeWLVumja54FRL4+eefzSyzzBKMeajv\nJypQihVx/fXX9yLgcxhD9M4kHd7ghVmRYwjUtEBiYObY8F133TX14OzO0Q/dGnyOnCz2Eft2Vr7x\nyyn3/fffb9q3b2+WWmqpch430efhNG7cOHPzzTeXlVZeH+Ikx5deesnstNNOwbNYKf/CDIVMrzna\nX6VAayGPacuYt2+J6xiOPvpoc8kll5hnn33WLLfccuaEE06w1ziMGTPG/j/rkNf6jeaLk1gPOeQQ\ns+eee06F49dffzWzzTZb0x2ASaySns+abZr055hjDjNy5EizyiqrpInebHHKmR9EMxd9np+nqaes\nC3fwwQeba6+91l6VstFGG5l77rnHiu4ddtjBfPvtt2W9PtT34+ZWjPHFAvMf3+B7OawOb/AlrPhx\nBGpaIP3www+mdevWXsd/+1h7sm4ybnNh9FJZ387KN345ZWLgO+WUU8qe/Eef/+STTwwDz/LLL19O\nVnL7DMfVH3XUUeajjz4KnsdK+RdmKGR6zdH+igFN8z1XO48hG0TeviUmIptvvrkVRe+9955dqFp4\n4YWtQOL/zfGdV6N+fdtdksD5999/rbBcc801iy7ySSD5f0XlzA+ib4k+v+CCC6aqJ/9cpn/ir7/+\nMjPOOKO98qRTp07mhRdesN8XYmnLLbc0P/30U/rEIjFDfT9OIE2ZMqWsfIR+SIc3hCbaeOmlFkiX\nXnqpwbT7xhtvmOeff9507drV9O3b17B69Nprr5nzzjvPriKttdZadrBcb731zOuvv27OPfdcs846\n65jbbrvNvPnmm+bee++19x1xWMOOO+5onnjiCbux7qqrrjIc5c1EnNCvXz/zyy+/2LS++eYbe3ks\nKw9MEC688EKz2mqr2cMe+BnWjSeffNLMN998JWswzcBWMpGAEQpvhvbtrJLiw5KVnLvuusvMO++8\n5oILLjAMsptttpkZMGCA5UfgYt6ll17a1gcX8yJgqJM11ljDXuBLHZ9zzjlmkUUWMTfddJPZeOON\nDacQUp+4VtAxX3zxxWb66ae3HTVHub/11lv252effbZNI/o8lhbqs3fv3rHxyUuxQPvh2YkTJ5pd\ndtnFXHTRRTbvDz30kDnppJMM93B16dLFth/yTFnjykVbicsv709qb0899ZS588477UoiXFdccUXT\nv39/O2jB5eOPP7YreeSxWMB6dvnll5vJkyfbFeVTTz3VPkOe77vvPrPAAgtYxnxryyyzjP2GHH/K\nRx1RJtxaKF8l9cE3xPOk8+eff5o99tjDvo8ydezY0dYn1oEHHnjA5qUw+LbXgJ9OU1LkgZBkxa0k\nj1hs+AZeffVVs+2229o6os9LaiNYEWnvfD/XX3+97Q8RzlyC/cEHH5gjjjjCtl/Cgw8+aOhX33//\nfbPpppvabxRxkdRmC7/FSr6lpLaMxYfvPNrGrr766tg20qdPH1suLvumD3j33Xft1Q277777VAIJ\nJscdd5wtJ/0PbZ+FrcL3UO5yQiX1W8773DM+7Y6+l7ELRvyBEd8Vfegmm2xi7rjjDvuNDxo0yH5/\nuFBtv/325sUXXzQDBw60fXfc8/S7SW0xbvxNKi8ukmn7ARa4GI+/+OIL673BHYb0VeQvRF3HfXPX\nXXed7V+ZJxAYYw444AB7j+JWW21lsKxcdtll5vfff7fjAL8rnB/Qd8bNU0gvblxhjuPmF4899pjZ\nbbfdmuopaRxK4ps0f2IsKexnaRM9evSwfQ4CiO+lc+fOtj3AhvsjaRN8U8wfEEZRgRQ3vtBWk/rz\nUN9P3gQSdVE4v6rke9ezjUcgtUBiosRkkAkCgygdEoM9HSX/32CDDcyBBx5oJ46cVjd69Gi74vKf\n//zHLLvssqZXr15m6623tpM/RBArIPvss48ZP368YcXhxBNPNBMmTLCTCgL/R5DxPgQW9ycdf/zx\nhoGdiQQCgM6YAZdBhQ6CSV2pkDeBRH6jvrtM8nxWYJI6NwZfRCOD7eDBgy1zBg/qCWa8hwn67LPP\nbpkOHz7cDnTwYeJ/2GGH2T+IkC222MLsvffettP++uuvrRjAzM+9VLiU0S4YTBZffHHTrVs3W69M\nmOaff35b79Hnr7zySjv54vm4+K7+4+qRNrXyyivbCSuTddoe4oIBY6WVVrITUQZshBkCCsFOOePK\nRVuKez8DcVJ7gyODJO28e/fuVpzh3sDk5oorrjBMGGn75Ccp8Ht4UIYVVljBXqpMeuSHCQaD2DHH\nHGP/PWTIEJvHKD8mkdQldcji
AeK3kvpgwD322GPt5Ai2CCTqj0UMrAC0DwZiWMfdTxZqcC313Rb7\nfaHbbKFQKjeP9EcI6bvvvtu2dTjxHeBSltRGXnnlFTvhRSjz7Rx++OG2j6NtEGijTNT4NhZddFF7\nJxxpITRow0yGYJ/mW6zkW6KfjmvLCGH6jGgb+/DDD2PbCN8AopE/9BX02fT1/O0sSHBj9Z2FGBbE\n+GaY0I0YMWKa95R7umm59VtJm4tOvlw6xdqdu+yc75cJMLzoc7EcMUnm8nRY0d5gxHcNx99++83y\nSnqehciktsgCUHT8RTQkBfq9tP0A/Rv97/7772/7PSwaCCTqudK6Tvrm2rVrZ/s8WNAn8X2wkMcf\n+NHu+B74VnH7hBttLDo/oH+Lm6e88847seMK36h7nrZJ/VBPCMS4cajQfTLKOmn+RDuI9rO4YbK4\nQvr0CYwBMOYb5A9CiW8XUdi2bVvrYkf7dwIpaXwhraT+PNT3k0eBVDi/0r6kSnu9xnreSyAxsWWS\nSOAjpbPH+sO/mcTxobEqutdee9kO6uWXX7YdNCunrMwTj8krHRLh1ltvtRPNYgJpv/32s4MI715i\niSVsXDphVjXptGrZxS7a1Ny+JP4OIZDoeDHBY8Fh4sUgxsB24403mmuuucbWAZMh+ONKQL1Qlwwq\nBDpUVqjZ4xV168FtBnHKoMwKIgKJznnttde2bQCrH242TgQx8Y4+TwfF75icJcVP+gRZGaQcLo8I\nwM8++8wOHOQdyyaBvQ+ID/LPYBlXLgazuPcziUtqb7g4kG+E5kwzzWQnuqyYU2ZEZhoXOxYJGHid\nEKQ+sIiyuk7ZmAixkoxFgbwTovyYvCKuEKqIs0rrg3pjEkD9E8gXAx11xIDKKj+CKSmEGlwr7Xbd\nan40HTdhLTePtC0EKhMvGPCd0K+xsJDURhCRCCT3HTCJpG5ZLMKdihV/JsZM+PjWWRXm0mwmfKz4\nE49vJM23WMm35ARSXFtGpEXbWFIbof3TvyPw6fPpSwoFEuMFq+ewpB5gwao47Zf2HX1PuW2g3Pot\n933R59K2O75r9oy4BTomwUyasSQ5gYTFAMv8I488Yl8BN2chSHoeT4Cktkg/Ex1/i5U3bT/AWE4d\n02YRcbRZxhUEEm220rpO+uawUjLWM55tt912dkxjjOO7hN+wYcPsfIBFVSy89IuUPzo/SJqnwDhu\nXEH4u+dZoHX1RJ8dF5+5SlKgruPmT0OHDp2qn+V7YbHRCUHSozzMtegr8JbAjRseLDYXCqSk8YXx\nPqk/D/X95FUgOZHkc9hDiL5BadQ+AS+BxMSXDpDA5JvVblbD+HhpfKyU8uFiCncCKWr6ZQUL1wEm\nHNE04gQSq+is1LDis/POO09Dmkk+P68XgeQKSGflc5Z/Uufm3B/oyKkTVr1ZzebnrFwz0DHZbtWq\nla0PJlsMTkyICUzace/AOhedoDMpwkUIcctKHoGVR0QrLg5MgApDnEBiJSwpftJnxYo8g55rPy4e\nP8c1ifQItD3aKqv5uN7FlQtREPd+Bpyk9sYgieDDVZTgJrwIMyYMaQQS1lZWIaMBjs5/HGHEoOkW\nBIhXKJBw0UG4EiqtDwZf6nKbbbax6SH0sEghBhlQeRdiMynETRDz2C36LDpE809/hwWNQF+GeGAy\nmNRGWAjCCuc2TGM5wRWS5wjwZmGC9o8rHqvDTOoQxYgxJ5DSfItOIJXzLTFZTGrLtIFoG0tqIzAt\nJZBwicILoDBg3WAxLfqecttNntuga3cIHMY/t9iAWGLCiwXZTbzxjEB00PcSsIC78TPpefrDpLaI\niEi7NyVtP4CoxcLl+nlc82jffBMsroSo67hvDosVHgz07Vg3+c5wM3bCxS3Cwo32QH4Y56LzAxZV\n4+YpWIXjxpXoHqSoQOL9cfGLtV8EUtz8iQUKRI/rZ5lLsWjgFpFJk7pn4Y40SgmkpPGFsiT1540g\nkFzd+M6vyu2T9Fx9EPASSEya3Mo3fzNRHjVqlO2sWNmho2byx4QuTiDhlsc+BzcJZ6LAapATSEwq\n3O9YvWHSy8SNySx+1m5CTmfCZILV2HoRSKEtSLijYYL//vvv7UQEd0jXCdPhwpdVa1YrmeQw2WLA\ncfWbJJAQWohjJljUAXXOQEwa7P2hjTBgfvXVV3Y/DRPEOIHExCApftKnRX7Z70a7IWChRPAxmDAg\nur0/TL7Yl8bgg895XLnIV9z7GWCS2hsTR1wGESUEZ6liRY88pBFILCpwzCmuVQSscbRtVpQpDyu5\ntHMmUgyUhGICqdL6YNWVwZfVWAILD7QX2kWajfahBtdKu1NW0mkD0VCpBYlvh/phokU/xzfCv5lQ\nJbURxDNt27UR2hkcnXh3AmnSpEl2goyIX3XVVe0iE66aTiCl+RadQCrnW+L7TWrL7LOLCpekNsJq\ndymBhJDHTcidXEX/QN/E3iy3b8mJ/XLbQDXbYNp2V3jIQpxAQkTjYoabGIH+DKuEc7GLnoLnnkc4\nJLVF+qe0AiltP4Clhne7fh4rPZ4drv+ttK6TvjnaCO9m3KLd0z+xkOhOAWRRlvEuSSAxH0mapyD4\n4sYV+uk4CxLfR1x85jJJAXETN3/iO4/2s3hH4KZOv8P8hrkRVmm+NcaIUgIpaXxhDEnqz0N9P3m3\nINHGfU4bLrc/0nP1Q8BLINFJMfnkQ2NijGUCqwTuVnzodCZMCPBnxsUEl4HCzYNM1EkD8zCdHR0q\nnQCrO6wc4TZF50BnwIoRHx1+2QymPMvqOh0IgywCCr9g/s0kPU3I4x6k6PGYofYgwRaRwB4dVqip\nGzp1BnQmwbBlFQwhQ0dcTCBRFwcddJDddwZ7BiKsOKSHhQ9hwKqaW/1kAolViUGJDaPR592kDiGW\nFD+pHplMImrws8ZNB2sL7Y1ykg820jIRwRUPwYL1KKlc7vAKhGA0v+zdSmpvtFtc7HgPgyd7BNgs\nS9unHTOZcYcnJJWB+mACRBoIIbhiVWWBgboh/7gt4qbFZBnrQJQfVr/o5LXS+sCCwXcND6wYWI+Y\nuPNd14pAYvLN5IR9Z4RQe5Bc+2byR19HW6JPo00ntRFcx9IIJNJkNR5BxeoubjXsocQamfZbrORb\nci52cW2ZbyLaxpLaCN99KYGEJYwJPG6FThRhVeI7KXxPmv47Lk6oCZ7v+33aXRqBxMITCxVMhmlv\njJ1YRooJJKznSW2RdpVWIKXtB1jgpN9m4Yn+jkUr6pH2zAJnpXWd9M0hiPBWmWeeeex4xh5WXDuL\nCSQW8Nz8gPlI0jyFth43rjCGuucZ85ylj8WNuPjFXJERSHHzJ9z1ov0sIoqxmvkO7rnMd6hj6hJr\nXSmBlDS+4CXQqAIpOr/SHiTfXq6x43sJJFZwnHsQfsecILPQQgvZDYOPPvqoJcmgyUooEy06BfZ5\nOPchOjg2ozt3E9KgQ0QgMZAyGcQiwESNDomNigymTELZfO8CnTOTYILbvMqA26ZNm5K1m
TeBlNUp\ndqyu0Yk7qxuC1lmHEEWISyb4uNkQ4iZlWDRYxWQ/EvGY2LOahcWBwMBM/eJWwQDJoRlM+AlMhphI\ncuBD9HnELKuO/A4hHRc/qRJxaWPQcHco0e7IE/uB8OVn0kd5ERy0TawxxcqV9P6k9kb6TtDgTko7\nRdjAGaYIHFwx4twMXZmYxLOBn2/JcYIxgyeMca1jgGSwZbKMKGPC7/gzYcK/ngGPwCblSuqDTd5M\najgVyuWHyQgTkTR32VRrchptI24VP4tT7PhucLtkMYG6xcLNATNJbYQFnlICCQsdlkrY45pJYGKM\ni5U7wa7QghT3LVbyLRVry/Sv0TaGhTOujbg9LnwT7Okr3IPk7kFiscRZ0HiGFXi+mcL3lOy8EyJU\nqw36tLtSAonvGksMTBjz6F9gzh4fhEfS87SbpLbIAkt0/C3GN6mO4/oBFsdYrCQwZtN38C4WqkLU\nddI3x/t4L+/H0kY/mSSQWHggP25+wLyCBb64eQp9ady4ghhxz7PPFQsU9YR4TYqfxJi5UNz8ie+h\nsJ+l/lmgYixDDLKQghhFnDLWOWuZ24PEXlRXz0njC3OjpP481PeTRwuSjvout1fVcxDwEkhMEHAN\nYlBnc3o04FJCZ8oqC65N/CF+XOAjZqWED71Dhw5NhxIwAWYFlo+50K+cFSAmj85qFE0XixMrPWlC\nngRS1vcgIUy5lJULXqkXF3BjYBUQsRt3dHMcR1awEEdYm9gbhiBgQKeeaA/UPb9jwGKFDeEcDdHn\noz8vjM9qHnVdGHgPm5EJ5J93FbYv8sSAQnnT3uyelN+49sakkokewgWuiHgGJxdYAGADP4NoXMC9\niokxCwKIRDiy4TnNHookfryn0vpgZZJvCwsGLmRp8uPKF2pwTfPtxsVhFR/rRLGVwUrzSF0xOSn8\nVor1SWnKQzvAPZPviPbKd0MdIPCLhRDfEhO+Ym258P2VtBHSoo9gou+uFEjDJ22cSus37Xui8bJo\ndyysMP4xKadMTNydtblUHku1ReqPxchi/ZJPHSNQaK+IhcL+olRdM/4n9ZEsOjF2JH1zLOrRz99y\nyy2lkDT9Pjo/KDZPSRpXkuYX0fil+HKKZbH5U2FhYISogy97l3yC7/gS6vuJWmqK5Rfvi3ICXiI+\nIU9zPZ98K25+CHgLJGdxCFEEDgGICqQQaZZKo/BI4FLx0/7e17cVf1g++MKJnW9n5RufQxewJDER\nc3t50paxOeJhHWL1tDBQTvZMVDM4gYSlLCkgVnBXiQsIKtzn6in4tr9qlL0W8pgFl2LfEkIYgVSs\nLWeRpyzSzGv9+uaLRQomxM5dGWsSLr+cEFhpyFO/hDhnHIoLuGwyJygMCAb2CjH/cK7ilTIJ+Xwp\nvliP3CE/Id8bIi3fdpr0zuh1JcXy5fYj+uQ9bq6UJh9yqfOhrLjTzDunpDzeiX0fmJzLvasiDj0r\nQbgHsNG8loPPSkW0E4n7eH07K9/4uPew+ocbg3O/q2X2zZl3rFtMKIv5mjdnfvLwLt/2V40810Ie\nm5tLPbXlvNZvOfnClZKxFss2bojss1Qw1k0f13zcqovdM5dXVlnMn0KVtZx2mvTukGm5d/jMr7Tf\nKFSrUDoQSG1BEq5kAmk/4DQfr28H4xtf9SgCIQnUQvurhTyGrJNGSyuv9ZvXfDVa+1B5ixMI2U5D\npuUrkLTfSC09NAEJpABE0wiktB+vbwfjGz9AcZWECDQRqIX2Vwt5VJMqn0Be6zev+SqftJ6sRwIh\n22nItHwEUtKWhXqsL5Wp+QhkLpDwzWUDo+9Gw+ZDUPmbSgmkUr+P5sC3g/GNX3lplYII/I9ALbS/\nWsij2lT5BPJav3nNV/lIVhEUAAAgAElEQVSk9WQ9EgjZTkOmlUYgldqyUI/1pTI1H4FMBRLHU7K/\niI2mHAPKCV4nnHBCquODmw9B5W9KEkDlfLy+HYxv/MpLqxREQAJJbSA/BPLaB+Y1X/mpOeUkDwRC\nttOQaZUSSGm2LOSBr/JQuwQyFUiYPTfffHMrirhXAysSx3SnuYCylpDGCaRyP17fDsY3fi1xVV7z\nT6AW2l8t5DH/NZ3fHOa1fvOar/zWpHJWDQIh22nItBwL5pEcaBU95jvtloVq8NQ764dAZgKJ4zi5\nIJRjS7kVmlOTuDuJCz2jAolTwY477jh7uSYX43FSDRfA1VIoFEiFl7/6lMW3g/GN75MXxRWBUgRq\nof3VQh5LcdbvkwnktX7zmi+1JRGIEgjZTkOmlSSQtN9I7be5CGQmkLgIbtttt7V/uO2bC964FJO/\nnUDictcFF1zQ9OzZ0+y44472ngOO8xwxYkRzlT/Ie6ICKe7yV5+X+HYwvvF98qK4IlCKQC20v1rI\nYynO+r0EktqACGRBIGT/GDKtQoHE/xFHvndOZsFMaTYGgcwEEvi49O3AAw+0F9117959GoE0ePBg\nc+mll5rPPvvM3saNFWn55Ze3t4kjnGolOIHEniOfC83iyufbwfjGrxWmymdtEKiF9lcLeayN2s5n\nLvNav3nNVz5rUbmqFoGQ7TRkWlGBxL+ZX0kcVauVNOZ7qyqQrrrqKtOvX79pyL/99tumXbt2NVMj\n5e43CiWQagaUMlqXBFLeNV21sjNoK9Q3gTy2QbW7+m5z9VS6UN9PlgLp6aefrhi5vsmKEZpQbaXy\nnGSfQlUF0u23326GDBliVwYIf/31lxk9erRZa621zIwzzph96QO9wecY71KvzKKDKfVO/V4E6oXA\npEmTzIcffmgWWmgh06ZNm3oplsphjPn777/N+PHjrYv2fPPNZ6hrfsb/55hjjiZGDOCaCKnJiEDz\nE8hi/uI8c0KUJov8hchXraTRaPyqKpBwreNghuHDh1tRxGEOWJW+/PJL07Jly1ppM0Hz2WgNMCg8\nJdawBJ544gm7l3Hs2LFNDGaffXaz1VZbmb59+wY/+GWLLbYwjz/+uH1Xlitqlbznq6++Mt99953N\n40orrVTTfepJJ51kLrjgAluW1Vdf3bz66qtmlVVWMW+99VbT//ndUkstZT7++OOpftawH4UKLgLN\nTCDv85e856+Zq8v7dY3GL3OBdNBBB5k999xzmj1IY8aMsfciHXvsseayyy6zFTX//PObO++803Tp\n0sW74urlgUZrgPVSbypH9Qice+655pRTTknMwCKLLGIt1SuuuGKwTG666aZm6NChmQukSt7D6aDs\n8SR88803ZoEFFghW/uZMCKtRNO/FBNKiiy5qvvjiC9O+fXvz5ptvNmc29S4RaHgCeZ+/FOaP62cY\nFzp37mzv6owGrqhh3KAfoT9JCp9++qnhNOaddtppmihPPvmk2WOPPcy3336bqm38888/0yxksdDH\n3vY+ffrYU6GzDsXKk/f6Dc0mU4GUNrOsctKAll56aTPzzDOnfawu4zVaA6zLSlShmo0Ag9eqq67a\n9L5DDjnEYHX55Zdf7MLLyy+/bH/HABPCh929
qBLh4gOnkvfUi0C67bbb7EE/hLPOOsscfPDBdjGN\nQ34mTpxo5plnHgMnggSST+tSXBEISyDv85fC/HH9DNZ1Agsr3NNJmDBhgnXjJZQSSA8//LA56qij\nzEcffTQNzB9//NEePtahQ4dUoJ1Auueee+whZ4Tvv//edOvWzSyxxBLmqaeeSpVOJZGKlSfv9VtJ\nueOezYVACl2oWk6v0RpgLdeV8l59Ahz76vYwcpkgpxy58Pvvv5vFF1/c7lshcPUAizCETz75xE62\nWfljkGSFEDdf9hO6QdKlM3LkSHPllVdat65lllnGHHDAAebaa6+NtSBxRQEufa+//rpdCVx33XXN\n/vvvbzp16lQSls97brnlFjNo0CDDgTZzzTWXHUx5xw477GBatGhhTj31VPt7ykbYZ599zEYbbWSt\n+YRSz8dlFlfCXXfd1f6qa9eudgIxcOBA8+KLL1p3NyYJ0XLyzt9++816BHDFw/XXX2/3DT3yyCOp\n6wDrIPFfeOEF+8x2221nBfFpp51m6wo3baxL5513nv19kkBKUy+l8luyAhVBBBqcQN7nL0kCCQHD\nIkyPHj1sDd58881mwIABtt9xAinuzk6updl4442tWy99b+/evQ191jrrrGNY2Lnhhhts30RfzH7J\nc845x1xzzTVm3nnnNYceeqg5/PDDp2oxTiBheaK/doGx6rrrrjOff/55U/64M3Ty5Mm2T6e//+GH\nH+x1Odtvv719BwEPAsQV4aGHHjK4KpMGfTIHpOFdcdNNN9nxkK0t9Nf05648995771T5y3v9hv78\nJJBCE60wvUZrgBXi0uMNTMBt0HcIGCC4Wy0annvuOSsiCIgp3Hr5GRP8n3/+eRp6uDNgaVpjjTXs\n71jJ23nnnYtSdnuQ2D9ZOOC5BxloGECTgs97TjjhBHtnXFw4+uijreUM0eCEoYuHAGAwTPN8XNqU\nE/FFgFMcv7vvvrvJ1cQdlIBbCAMuARHKRCNtHVBf0X1lpMGgziDv9iCRvlu9jRNIaeulWH4b+DNT\n0UUgNYG8z1+SBNIll1xiHnzwwaY7OBEVjBdY4RFI9Kdxd3Zixb7iiius+xt7UvFcYLGKBaFevXpZ\nq49zsUNw8TO2kWBZQpDRL7Zt27aJrxNIjBfrr7++3d/KdhQW5RBMV199tX0PXhIsxK2wwgpm3333\ntdfp7LfffnaByO3nZ18uC3tYoFgcxFLGIhaLXGeffba1vrMoh4DDRZ3zAPg9i3uuPLgzR0Pe6zd1\nQ00ZUQIpJajmitZoDbC5uOo99UeAVS/nk+32pZQqJaKKibWzrOB7zoobJ2q+9tprTZN4VtGwQDHA\nOaGBjzmTcywhUXHAIIYlA2uVEw8MrJzKyeDp4rLaN9tss02TRVbt0r7njz/+sJYb0qTsxx9/vD3F\nDSuVezdWmkcffbRpMOXnDM6rrbaatZSleT7uFLioQCJN3g8TuLn9WLi+MehTzsI0+B3CFH5p62DY\nsGF2JZYVXQKDOeIVl7o0AsmnXpLy695dqm3p9yLQ6ATyPn9JEkiIEO7gxIoyyyyzWLddFmYQOggk\n+qGkOztx43YudggOBJLzVojuQVp77bWtNQfrN6F///52/5NzpeNncXuQXJvidFYOodl6662tizH9\nKOHGG280F154oXW/QyA9//zz1qWP/hqPCRbFEHIs/PE7AuVFXLHf6NZbb7Vii/tHWQCTi93/vmIJ\npJz1aHnvYHKGS9lpYAK4TbHKRuC0OlwISgVOzGQ/EgE3g6hPt5tw87tXXnnFCiNW6grT5yqClVde\nuelVDERRKwXud6zoEdwdafz7jjvuMLvttts0WXQrgmneg588gxmBwZbBFaHIgOiEHG4fWHji9iD5\nPF+Y0UILEoMsR6kzqHfs2LFpvxebnRGeUcFx1113mV122cUm6VMHiCFcQTihkOAGf/6dRiD51EtS\nfku1Kf1eBETg/wjkff6SJJCwprBQxoJPq1atbH+OKJp11lmtQMK9LenOTvrfqEDacsstDX0wISqQ\nuIoAl7u4wxxc+4nuQXIWJBZ5GKuwROE6F2dVp79n0Q+BxP6p1q1b2yQZv9Zbbz3zxhtvWNdxdyAa\ni3KUjXEO8cSzLBISJJD+9zVLIOWsZ8t7B5MzXMpOAxNwq2AgQCC41bFiSBAvbPIn4KaGy4MLJ554\nol2JI2C1YJBzPum4NuAz7gITd2dxQjjg2oZbAoHBilVIQtTNDTcGXBsKA4Ne2vfwLAfaIDgQVqNG\njZrG1a2YQPJ5vjCfUYGEJeixxx5rioKPPHvACOxvwn3ECQ54uAkDv/epAyYFlQgkn3pJym8Df2Iq\nugh4Ecj7/KWYQML1jT2V9FcIEsYJJ5BwgU66s5OfpxFILGbhZn3kkUdapuxLWmyxxcyaa67ZxDhp\nDxJjEwtP/GExCvHkLFG467lTShFICDY8Euiv55xzTjvOIZoQSW5PEW7neBPgJYF7oQRSfDOXQPL6\n/LOPnPcOJnsCeoMIpCPgVsGcKEE4OGHiUrjvvvuahFP37t3tih5XCxBwTcB/2wUGCieYmMQz6LhB\nKLq3hvgcvsA+GgIDEek4VywG2LiAFYQNvIUB0ZT2PQguBki3p4e0NtlkE7sHx/2smEDyeb4wn1GB\n5PYzuTi4EjJJIODSwe+d4Cg8QpdVzLR1gCWuEoHkUy9J+U3XGhVLBEQg7/OXYgKJ/UPugB76Uy4b\ndwKJsSDpzk4EEvt/cM9jkS7JgsShCiy8MSbxrnbt2lk3Pg7+cSFJING/Ml5hzeJABsYjFqgYa7hK\nh3GPe0QRSBzEwIIVi2gsVOHxgGs2nhM8w9hFnHfeecd6XTAmRQUS7tmuPIX3kea9fkN/gRJIoYlW\nmF6jNcAKcenxBicQteTgI37MMcc0EWEQYpBze4BwL2MDKu54BE7/wf/aBXfvBf/HvYL4iCqCO/yA\nf5Me7hIuIBzYG8MJRoRSx8IWVhl5SPsexAeDFwHBxeogVyNEORQTSD7PFxNIhXu+cE+5//777SP4\nujMYO8FReMQ6p9KlrQM2SlcikHzqJSm/Df6JqfgikJpA3ucvhflz9yCxb5M+HfHAwhsnlroFONef\nJ93ZyYXc7O/kNFFc6HBri3OxY3EKyzueBwgbLEmFHgVOIOH6Hb0P1F11gHWI92CJYowicCgDliH2\nDyGQ6JvdO/BaYJ8qY9Tuu+9uRZPzcMCVjn1R9JFwcGNhtDwcUR4Nea/f1A01ZUQJpJSgmitaozXA\n5uKq99QnAU7qYWXPBU5oYwWPAY9VNHcPknMJi95vwTNMvtn0imuFsx4xgCCOWBF0l8vyMyxMuENc\ncMEF9nS7qECK7iNyBxFMP/30VrCx94jAQQYcCVsY3CDNz0u9h9VDjoolMMCxD4mBMnokrBvso3uQ\nGDAZxH2eLyaQ+B37q9hXxOAcvaiXe+04xjZJcPjUAZOWSgSST71IINVnH6FSNR+BvM9fKs1f0p2d\nHP6
DoEryHojWAGMLhyzMOOOMZVcMgocDFrAMcZAE5SJdBBL7qRgDOIyn0KMC8cMCH4c9FFqHoplJ\nKk+l/MoucJUelECqEvik1zZaA8wZfmWnBglE3ajiss9ghP811iQClibEQ1LgdCG3T4nT4XBtKBYY\nrPizzTbbWNESF+Juao/GS/sehJy7i4jnGRwLj8HmNCKOpI26shGXS3Sx5qR9vpRAiitndJ9VMcHh\nUweVCCSfepFAqsGPX1nOFYG8z1/ynr9KKtMJJLdAVklamp/+HwEJpCxaUQVp1vMHXAEWPSoCRQng\nOobLQuHdPLgiIBTYDBsNnEqE73ZhwM3AXabK73799Vdz2GGHNe0v4mecQjTTTDNZf3KCuweJlTsO\nKnCnzLm0ESQc8lB4R1P03WnfgwsGVinK68qKxYwNt+eff75NEksX7iCcfsT+JCeg2BfE0bBpny8m\nkLCS4dKBv7oLWO9wGeHYcTu4TDed/bvwtEAXP20dRE+i4+JGLmEkuFPsEInOFYSVUfZiRV0A09ZL\nqfzqExQBEShOIO/zl7znr5L2xXjAOIM7eCXWqWJ5qGd+ceWWQKqkRWbwbKM1wAwQKskGJfDvv/9a\ntwNOt8PVgc2vXPCXFLiniLsluHSUO4yKuR3gWsEkvFSavIsJOXERM9w1UUwYFeYt7Xs4fQi3uTTp\nc8oR+7FwV3N7p3yed3mMHtKAPzvHwpI2G5rJR9wdT6Waok8dlEqr1O8rqZdSaev3IiACtXfMt+rM\nj0CjzU8lkPzaR+axG60BZg5ULxABEQhCIE4gBUlYiYiACNQFgbzPX/Kev7w3gkbjJ4GUsxbZaA0w\nZ/iVHREQgQQCEkhqGiIgAsUI5H3+Uix/9G9cFcH+zVKBAxl++OGHpmPBC+PjzTBu3Dh7wiiXaYcK\n5NGddBcqTZcOe1fdPt2ktPNev6GZSCCFJlphes4PvsJk9LgIiIAIiIAIiIAINCsBtyezWV+a8mXF\nJvhccM2pp1deeeVUqbHXkf2q7EXFHZtLW93JpOx/5NoF9l8SKDunnLIf1O0Rxd2bO4rcwT/uWgd3\nX5x7GZdi4+rNsdtJgXv8OJSIfBQG3Mtfeuklu0c2beDiWfbpIgxJd7vttrOnukog/R8BCaS0LUnx\nREAEREAEREAEREAEapJAkkBiLyXihCsX3GWxroBRgYRQ4pQ4DuLBOsSppVzGynUC3KPHNQpc6oqA\n4voJLElcg8DBP1wUy2XaCLG9997b7pPlXqPWrVvbVyGQOEzIHbYTB7iYQCIvpM+e0LRh5MiRTRfb\n8lynTp3s861atYpNQhaktGQVTwREQAREQAREQAREQARqgEDSBJ+TMrmcddCgQdOUIiqQOPQHS9Jp\np53WFI+T41ZddVV7UM0888xjOJ3zgAMOmCod4vfp08eKKyxI5557rr2naOWVVzbXXHONl0BCSHEt\nBOlwhcWAAQOsyOKOPU7wxCLE5bDcl8cF4pxcymWyiKeTTz7ZfPDBB+aII46wF5tHBRKZ4D693Xbb\nbZr8u8JIINVAI1cWRUAEREAEREAEREAERCAtgaQJPoIGyxGXXxeGQgsSAohTPLEQrbvuumbJJZe0\nj3AFQceOHe31ClzYGg24sq233nrWlW3w4MHmwgsvNHfeeae9quD55583HTp0SG1B4uoGXOFw7cMq\nhZWK6xawXCHCsGaxP4p4iCUu8z788MPN+PHj7e8JiCXc9LBgcU3ETz/9ZH/OVRn8PE4o8nsJpLQt\nTfFEQAREQAREQAREQAREoAYIJE3w1157bdOjRw/DXXHFBBLXI9xwww1WQAwbNsxGRYggdhBI3bp1\ns2ID97loGDVqlOGycCw8WG0QSO+88469kw7L1auvvmrd7tK42PG+yZMn22sVEEMIIN4ZdbHDFY94\nXO8w66yzWosTV08g1BBU008/vRk+fLgVPFGBhCWNewOT3PQkkGqgkSuLIiACIiACIiACIiACIpCW\nQNIEn58PGTLEbLrppvZycALCguAsSIcccogVIu5eO+I9+OCD9vCFAw880O4z4oLqoUOHWne3aLjo\noovsQQ5//PGHueuuu5oEEi53HPSACx4iKo1AwsUOSxTBWaY4HKJQIEXj9ezZ07Rs2dJanAjch/fA\nAw/YS72jAonysKfKWZQKuUogpW1piicCIiACIiACIiACIiACNUAgaYKPYMB6gjhATHCMN3t3COw7\n4mQ6DjBYfvnlrXXFudXxe9zzcF/jYAYsR7169TIIomhAeHEp9tNPP20PaXAWJOJgfUJktW/f3rrh\n+RzSUEwgRU+7o0yIIaxDxQTSjTfeaPdQ4fYXFySQaqCRK4siIAIiIAIiIAIiIAIikJZA0gQfAcP+\nIfYgIYYQCS+++KI99nuLLbawgoEDDxZZZBFrHeJgBUTVu+++a7baaivTvXt3c/rpp9vnzzjjDHuA\nAsdl//3336Z///72QIRnnnnGrL/++tMIJKw//BzXO44QRyDdfvvt1pqEYIqGwlPsogKJfUjsS/ry\nyy/tu8oRSJzCh9jj4AcJJB3znfa7UjwREAEREAEREAEREIEaJZAkkDhV7q233jIDBw60AgNRxP8J\n3CuEYMECwz4jXNfYS+T2GXEQAiJopplmsoKItBBJLnDSHIIJEUYotCDxs9GjR9sT7ZxAQoyxhwix\nllYgffXVV2a11VYzc801l7WGlRJIuNNRJsrqXOp23nlnK8qwOEkgSSDV6GeubIuACIiACIiACIiA\nCKQlkCSQuGQVgRK9l2jcuHFmlllmsS520YAI4nfsJ+Jo7xYtWkzz+j///NOMHTvWWpmwBPmG+++/\n30yYMMHubfIJ5A33wMJDItKk4e6CGjNmjGnTpo0Eki6KTdNsFEcEREAEREAEREAERKCWCRTbQ8Md\nQVyQyt/VDrjqcVeROxCiOfKDax+iL2r9Knyv9iA1R03oHSIgAiIgAiIgAiIgAiLQTASKTfA5lY59\nPLjUNWLgdD32TbnT++IYSCA1YstQmUVABERABERABERABOqWABN8hcoIcKhEo4TppjRSaRulVlVO\nERABERABERABERCBJgKNZgEJXfWNxk8CKXQLUnoiIAIiIAIiIAIiIAK5ItBoE/zQ8BuNnwRS6Bak\n9ERABERABERABERABHJFoNEm+KHhNxo/CaTQLUjpiYAIiIAIiIAIiIAI5IpAo03wQ8NvNH4SSKFb\nkNITAREQAREQAREQARHIFYFGm+CHht9o/CSQQrcgpScCIiACIiACIiACIpArAo02wQ8Nv9H4SSCF\nbkFKTwREQAREQAREQAREIFcEGm2CHxp+o/GTQArdgpSeCIiACIiACIiACIhArgg02gQ/NPxG4yeB\nFLoFKT0REAEREAEREAEREIFcEWi0CX5o+I3GTwIpdAtSeiIgAiIgAiIgAiIgArkikDTBf++998yK\nK65oOnfubJ599tmp8rz55pubIUOGmDfffNO0b98+sTyffvqp
eemll8xOO+00TZwnn3zS7LHHHubb\nb7/14jFlyhRz4IEHmoMOOsisvfbaJZ/97rvvzPnnn2/z+p///MfsuuuuZrnlliv6HO+45pprzODB\ng838889vtthiC7P99tvHPiOBVLIKFEEEREAEREAEREAEREAEaodA0gT/3XffNSuttJItyBdffGEW\nXnhh++8JEyaY+eabz/67lEB6+OGHzVFHHWU++uijaYD8+OOP5v333zcdOnRIDevRRx81jzzyiOnf\nv7957LHHTNeuXUs+u8Yaa5i///7b9OrVy7z22mvm8ssvN99//72ZZ555Ep+9+uqrzZlnnmmOO+44\nM++885r99tvPDBs2zGy44YbTPCOBVLIKFEEEREAEREAEREAEREAEaodAKYGEgNlrr71Mjx49bKFu\nvvlmM2DAAPPCCy80CSSsRIgJBM9mm21mRchPP/1kNt54Y/Pxxx+bHXbYwfTu3duce+65Zp111jG3\n3XabueGGG8x5551nBg0aZAXMOeecY602CJJDDz3UHH744dNAxGo0adIkc/fdd6cSSOPHjzcLLLCA\n+fLLL02bNm3MP//8Y+aee27Tt29fs88++yRW0tZbb226dOlixR1h0003Nauuuqq56KKLJJCmYF9T\nEAEREAEREAEREAEREIE6JVBKIF1yySXmwQcfNCNGjLAEunXrZi0pCCIsSAiQBRdc0PTs2dPsuOOO\nVkQgjnBPu+KKK0yfPn3M448/bn755Rfr4rbssstaa84SSyzR5GKH4OJnd955p8GyhCBDWLVt2zaW\n+hxzzGEGDhxY0oI0efJk89Zbb5lOnTrZdMaMGWNWWGGFkpYvhFSLFi3Mhx9+aJ577jmz7777mqFD\nh1rBVxhkQarTD0PFEgEREAEREAEREAERaEwCpQQSomL55Ze3VphZZpnFuqaNHTvWCh0EEq5nl156\nqfnss88MaWFFIv7XX39tXn755SYXu5EjR1qB9MEHH5ill17aRPcgsZcI4XXaaafZSsCFjv1PxI8L\naQVS9Fn3vrXWWsu66aUJ3bt3N7feequZffbZ7T6suP1WEkhpSCqOCIiACIiACIiACIiACNQIgVIC\naeLEifaAAg5aaNWqlbnqqqusKJp11lmtQLruuutMv379pint22+/bT755JOpBNKWW25prUuEqEBC\n8OByF3eYQ6UCCfe9I4880rCvCJc+LFUtW7ZMXTsc8sBhEn/99Zd5+umnp3lOAik1SkUUAREQAREQ\nAREQAREQgfwTSCOQcH3DpQ1LCntzDj744CaBdM8999gT7Z555hlbWITE6NGjDZYafu4OacCClCSQ\nsBSxTwkhQ2Bf0mKLLWbWXHPNWIA+FiTEDflh31Kp0+vcyzp27GguvPDCJgsWLoDskfr8888lkLQH\nKf8ftXIoAiIgAiIgAiIgAiJQPoE0Aon9Q+4UO06kW2ihhZoE0jfffGMPZhg+fLgVRRdffLG1MuGS\nh0DiBDj+/fzzzycKJA514OCG++67z+5VateunXXjW2aZZVIJJPYrPfTQQ/YgiRlnnLHpGY4ZZ68T\n6boT+fglR3djAeNQCA5kWHLJJad6z/7772+wHHGYA9YmjgZHGJ511lkSSBJI5X9selIEREAEREAE\nREAERCD/BErdg8SpcVhs1l13XfPbb7+ZV1991f7tXOzYl3Pssceayy67zBYW8YHFCUHx1VdfmdVW\nW83MNddc1oWO+4TiXOw4bY4juzmGGysVlqSzzz47EV6hBYnjv9nDhDvgnHPO2fQcwoiDIwoDAm73\n3Xe3J9rFHRdOPsgP+SLg+sdhFVi1CoNc7PLfxpVDERABERABERABERABEUhNINQEH4sLl75yAMPM\nM8/c9H72ACGoED6lAvctIbCiVqBSz/D7P//801qCOC2P8qQNRx99tL1wlpPtCgNpsocKaxmCLCmE\n4pc2z9WON50sSNWuAr1fBERABERABERABEQgSwL1MMHntDwusE1zcaxj+euvv1oXu2OOOcZLVBXW\nRT3w82lfEkg+tBRXBERABERABERABESg5gg02gQ/dAU1Gj8JpNAtSOmJgAiIgAiIgAiIgAjkikCj\nTfBDw280fhJIoVuQ0hMBERABERABERABEcgVgUab4IeG32j8JJBCtyClJwIiIAIiIAIiIAIikCsC\njTbBDw2/0fhJIIVuQUpPBERABERABERABEQgVwQabYIfGn6j8ZNACt2ClJ4IiIAIiIAIiIAIiECu\nCDTaBD80/EbjJ4EUugUpPREQAREQAREQAREQgVwRaLQJfmj4jcZPAil0C1J6IiACIiACIiACIiAC\nuSLQaBP80PAbjZ8EUugWpPREQAREQAREQAREQARyRaDRJvih4TcaPwmk0C1I6YmACIiACIiACIiA\nCOSKABN8hcoITGSq6i8AAAAOSURBVJkypbIEaujp/wdcEY6/doYQKAAAAABJRU5ErkJggg==\n" - } - }, - "cell_type": "markdown", - "id": "63ecacd3-95ad-4038-b964-4aa6ae622418", - "metadata": {}, - "source": [ - "# Code Data Profiling with Data Prep Kit\n", - "\n", - "This notebook provides a comprehensive guide for profiling code data across 21 programming languages, focusing on extracting both syntactic and semantic concepts from the input dataset. This notebook guides you through running `code_profiler` transformation using the data-prep-kit.\n", - "\n", - "Pure python (run locally): this notebook\n", - " - **Input:** Parquet file with columns `language` (Programming language of the code snippet) and `contents` (code snippet). 
If you have raw source code in a directory you can convert it into a parquet using the `transforms/code/code2parquet` transform \n", - " - **Output:** Annotated parquet file with higher-order syntactic concepts and a graphical report of the distribution of Library, Language, Semantic Functional Category, and CCR (Code Complexity Rate) based on the input dataset.\n", - " \n", - "![short-workflow.drawio.png](attachment:a6d2929d-7cdd-42ea-ae09-edfaf87698c0.png)\n", - "\n", - "## Step 0: Setting up Python Dev Environment\n", - "Please follow instructions from [Getting started](https://github.com/IBM/data-prep-kit/blob/4171dfa00cf7bfe166fd30072ad8ec4c8cde6967/README.md#gettingstarted) section to setup your python development environment\n", - "\n", - "## Step 1: Input the dataset information to the `code profiler`\n", - "\n", - "a. If you have the input parquet file on your local machine, copy it to the `code_profiler_input_dir` folder. DPK also supports downloading input parquet from the COS." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a665322f-d7ef-4157-b05c-fde987725082", - "metadata": {}, - "outputs": [], - "source": [ - "# Specify the target directory where your transforms Makefile, input and output are located\n", - "user_local_dir = '/root'\n", - "\n", - "transforms_dir = user_local_dir + '/data-prep-kit/transforms'\n", - "\n", - "'''\n", - "Copy your input parquet file containing the raw source code to `code_profiler_input_dir`.\n", - "Assign the parquet filename and specify the column names for the raw code and language in the file\n", - "transforms/code/syntactic_concept_extractor/input/data_profiler_params.json.\n", - "'''\n", - "code_profiler_input_dir = transforms_dir + '/code/code_profiler/input'\n", - "print(code_profiler_input_dir)" - ] - }, - { - "cell_type": "markdown", - "id": "88d0dea6-e41d-41e8-b317-ccf16e81e337", - "metadata": {}, - "source": [ - "## Step 2: `Code profiler` extracts the syntactic and semantic concepts from the input dataset\n", - "\n", - "This cell has some utility functions for executing the transform.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "00bf1238-ac10-4b91-be52-99ac49dd9249", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "import os\n", - "import subprocess\n", - "from IPython.core.display import display, HTML\n", - "import shutil\n", - "import pandas as pd\n", - "import json\n", - "from IPython.display import display\n", - "\n", - "# Following are the utility functions for running the transforms sequencially\n", - "\n", - "def run_make_command(target_dir, command):\n", - " \"\"\"Function to change directory and run a make command with real-time log output.\"\"\"\n", - " if os.path.exists(target_dir):\n", - " # Change the current working directory to the target directory\n", - " os.chdir(target_dir)\n", - " print(f\"Changed directory to: {os.getcwd()}\")\n", - "\n", - " # Run the make command and stream logs\n", - " try:\n", - " process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)\n", - " for stdout_line in iter(process.stdout.readline, \"\"):\n", - " print(stdout_line, end=\"\") # Print stdout line-by-line in real-time\n", - " process.stdout.close()\n", - " process.wait()\n", - "\n", - " # Check for errors and handle stderr\n", - " if process.returncode != 0:\n", - " print(\"Error Output:\")\n", - " for stderr_line in iter(process.stderr.readline, \"\"):\n", - " print(stderr_line, end=\"\")\n", - " 
process.stderr.close()\n", - " else:\n", - " print(\"Process completed successfully.\")\n", - " except subprocess.CalledProcessError as e:\n", - " print(f\"Error occurred while running the make command: {e}\")\n", - " else:\n", - " print(f\"Directory does not exist: {target_dir}\")\n", - "\n", - "def check_directories_exist(directories):\n", - " results = {}\n", - " for directory in directories:\n", - " if os.path.isdir(directory):\n", - " print(f\"The directory '{directory}' exists.\")\n", - " results[directory] = True\n", - " else:\n", - " print(f\"The directory '{directory}' does not exist.\")\n", - " results[directory] = False\n", - " return results\n", - "\n", - "def display_html_file(hosp_code_dir):\n", - " # Construct the path to the HTML file\n", - " html_file_path = os.path.join(hosp_code_dir, 'src', 'output.html')\n", - " \n", - " # Check if the file exists\n", - " if not os.path.exists(html_file_path):\n", - " raise FileNotFoundError(f\"The file '{html_file_path}' does not exist.\")\n", - " \n", - " # Read the HTML file\n", - " with open(html_file_path, 'r', encoding='utf-8') as file:\n", - " html_content = file.read()\n", - " \n", - " # Display the HTML content in the notebook\n", - " display(HTML(html_content))\n", - " \n", - "def display_value_counts_from_json(file_path, metric_index, column_name):\n", - " \"\"\"\n", - " Function to load a JSON file, extract the value counts for a specified metric,\n", - " and display the counts in descending order.\n", - "\n", - " Parameters:\n", - " file_path (str): Path to the JSON file.\n", - " metric_index (int): The index of the 'metrics' list from which to extract 'value_counts'.\n", - " column_name (str): The name of the column to be displayed (e.g., \"Library\", \"Language\", \"Semantic Functional Category\").\n", - " \"\"\"\n", - " # Step 1: Read JSON file\n", - " with open(file_path, 'r') as f:\n", - " data_dict = json.load(f)\n", - " \n", - " # Step 2: Extract the \"value_counts\" field from the JSON structure\n", - " value_counts = data_dict[\"metrics\"][metric_index][\"value_counts\"]\n", - " \n", - " # Step 3: Convert the \"value_counts\" dictionary to a pandas DataFrame\n", - " df = pd.DataFrame(list(value_counts.items()), columns=[column_name, \"Count\"])\n", - " \n", - " # Step 4: Drop rows where any value in the row is NaN\n", - " df_cleaned = df.dropna(how=\"any\") # This drops any row where any column has NaN\n", - " \n", - " # Step 5: Sort the DataFrame by \"Count\" in descending order\n", - " df_cleaned_sorted = df_cleaned.sort_values(by=\"Count\", ascending=False)\n", - " \n", - " # Step 6: Display the cleaned and sorted DataFrame\n", - " display(df_cleaned_sorted)\n", - "\n", - "code_profiler_code_dir = transforms_dir + '/code/code_profiler/python'\n", - "code_profiler_output_dir = transforms_dir + '/code/code_profiler/output'\n", - "\n", - "directories_to_check = [code_profiler_input_dir, code_profiler_code_dir, code_profiler_output_dir]\n", - "check_results = check_directories_exist(directories_to_check)\n", - "\n", - "print(\"Checking directory exists or not\")\n", - "for directory, exists in check_results.items():\n", - " print(f\"{directory}: {'Exists' if exists else 'Does not exist'}\")\n", - " \n", - " if not exists:\n", - " if 'output' in directory:\n", - " os.makedirs(directory, exist_ok=True)\n", - " print(f\"Directory '{directory}' did not exist, so it was created.\")\n", - " else:\n", - " raise FileNotFoundError(f\"The directory '{directory}' does not exist.\")\n", - "\n", - "def 
clean_directory(directory):\n", - " # Check if the directory exists\n", - " if os.path.exists(directory):\n", - " # Change the current working directory to the target directory\n", - " os.chdir(directory)\n", - " print(f\"Changed directory to: {os.getcwd()}\")\n", - " \n", - " # List files before cleaning\n", - " run_make_command(directory, ['ls', '-la'])\n", - " \n", - " # Attempt to remove all files and directories, including hidden ones\n", - " run_make_command(directory, ['rm', '-rf', '*'])\n", - " \n", - " # List files after cleaning to verify\n", - " run_make_command(directory, ['ls', '-la'])\n", - " \n", - " print(\"Directory cleaned successfully.\")\n", - " else:\n", - " print(f\"Directory does not exist: {directory}\")\n", - "\n", - "# Clean the output directory\n", - "# clean_directory(code_profiler_output_dir)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "42fb20dc-b56f-43b9-b936-d47ed901d430", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "# runnning the syntactic_concept_extractor transform\n", - "run_make_command(code_profiler_code_dir, ['make', 'venv'])\n", - "run_make_command(code_profiler_code_dir, ['make', 'run-local-python-sample'])" - ] - }, - { - "cell_type": "markdown", - "id": "a08dd030-63c7-4bc0-9e0f-184f6af82b2f", - "metadata": {}, - "source": [ - "## Step 3: Metrics report generated from the output dataframe of the profilers\n", - "\n", - "##### Goto the folder `transforms/code/code_profiler/python/src` to view the JSON file (output*.json) and it's rendered graphical html report (starting with output*.html) showing the distribution of the following example metrics. \n", - "- Library\n", - "- Language\n", - "- Semantic Functional Category\n", - "- CCR (Code Complexity Rate)\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.7" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/transforms/code/code_profiler/notebook_example/workflow.drawio.png b/transforms/code/code_profiler/notebook_example/workflow.drawio.png deleted file mode 100644 index c4c10d80b..000000000 Binary files a/transforms/code/code_profiler/notebook_example/workflow.drawio.png and /dev/null differ diff --git a/transforms/code/code_profiler/python/.gitignore b/transforms/code/code_profiler/python/.gitignore deleted file mode 100644 index 17cee1df3..000000000 --- a/transforms/code/code_profiler/python/.gitignore +++ /dev/null @@ -1,37 +0,0 @@ -test-data/output -output/* -/output/ -data-processing-lib/ - -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - - -# Distribution / packaging -bin/ -build/ -develop-eggs/ -dist/ -eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -*.egg-info/ -.installed.cfg -*.egg - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -.tox/ -htmlcov -.coverage -.cache -nosetests.xml -coverage.xml \ No newline at end of file diff --git a/transforms/code/code_profiler/python/Makefile b/transforms/code/code_profiler/python/Makefile deleted file mode 100644 index 7cceb46d2..000000000 --- a/transforms/code/code_profiler/python/Makefile +++ /dev/null @@ -1,55 +0,0 @@ -# Define the root of the local git clone for the common rules to be 
able -# know where they are running from. -REPOROOT=../../../.. -# Include a library of common .transform.* targets which most -# transforms should be able to reuse. However, feel free -# to override/redefine the rules below. -include $(REPOROOT)/transforms/.make.transforms - -# Include the common configuration for this transform -include ../transform.config - -# values possible mach-arm64, x86_64 -export RUNTIME_HOST_ARCH=x86_64 - -venv:: .transforms.python-venv - -test:: .transforms.python-test - -clean:: .transforms.clean - -image:: .transforms.python-image - -test-src:: .transforms.test-src - -setup:: .transforms.setup - -build:: build-dist image - -publish: publish-image - -publish-image:: .transforms.publish-image-python - -setup:: .transforms.setup - -set-versions: - $(MAKE) TRANSFORM_PYTHON_VERSION=$(CODE_PROFILER_PYTHON_VERSION) TOML_VERSION=$(CODE_PROFILER_PYTHON_VERSION) .transforms.set-versions - -build-dist:: .defaults.build-dist - -publish-dist:: .defaults.publish-dist - -test-image: - @echo "Skipping test-image step as per configuration." - -# Ensure RUN_ARGS has a default value -RUN_ARGS ?= "" - -# run-cli-sample: .transforms.run-cli-python-sample - -run-local-sample: .transforms.run-local-sample - -run-local-python-sample: - $(MAKE) RUN_FILE=code_profiler_local_python.py \ - RUN_ARGS="--content 'contents' --language 'language'" \ - .transforms.run-local-python-sample \ No newline at end of file diff --git a/transforms/code/code_profiler/python/README.md b/transforms/code/code_profiler/python/README.md deleted file mode 100644 index 12a8df5d5..000000000 --- a/transforms/code/code_profiler/python/README.md +++ /dev/null @@ -1,54 +0,0 @@ -# Code Profiler Transform - - -## Configuration and command line Options - -The set of dictionary keys holding [code_profiler_transform](src/code_profiler_transform.py) -configuration for values are as follows: - -* content - specifies the column name in the dataframe that has the code snippet -* language - specifies the programming languages of the code snippet - -## Running - -### Launched Command Line Options -The following command line arguments are available in addition to -the options provided by -the [python launcher](../../../../data-processing-lib/doc/python-launcher-options.md). - -### Running the samples - -The code profiler can be run on mach-arm64 and x86_64 host architecture. -Depending on your host architecture, please change the `RUNTIME_HOST_ARCH` in the Makefile. -``` -# values possible mach-arm64, x86_64 -export RUNTIME_HOST_ARCH=x86_64 -``` -If you are using mac, you may need to permit your Mac to load the .so from the security settings. Generally, you get the pop-up under the tab security while running the transform. - -![alt text](image.png) - -To run the samples, use the following `make` targets - -* `run-local-sample` - runs src/code_profiler_local.py -* `run-local-python-sample` - runs src/code_profiler_local_python.py - -These targets will activate the virtual environment and set up any configuration needed. -Use the `-n` option of `make` to see the detail of what is done to run the sample. - -For example, -```shell -make run-local-sample -... -``` -Then -```shell -ls output -``` -To see results of the transform. - -### Transforming data using the transform image - -To use the transform image to transform your data, please refer to the -[running images quickstart](../../../../doc/quick-start/run-transform-image.md), -substituting the name of this transform image and runtime as appropriate. 
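For context on the input the deleted README and Makefile describe: the transform reads a parquet whose code-snippet and language columns are named by the `content` and `language` options, and the `run-local-python-sample` target passes `--content 'contents' --language 'language'`. Below is a minimal sketch of producing such an input with pandas; the rows, language identifiers, and output file name are illustrative assumptions, not repository code.

```python
# Sketch (not part of this repository): build a small input parquet for the
# code profiler. The column names match what the Makefile passes via
# RUN_ARGS ("--content 'contents' --language 'language'"); the rows and the
# file name are made up for illustration.
import pandas as pd

df = pd.DataFrame(
    {
        "language": ["py", "java"],  # assumed identifiers, matching the ruleset file suffixes
        "contents": [
            "import os\n\ndef list_files(path):\n    return os.listdir(path)\n",
            "import java.util.List;\n\nclass Demo {\n    void run() {}\n}\n",
        ],
    }
)

# Writing parquet requires a parquet engine such as pyarrow. Copy the result
# into the transform's input directory before running the make target.
df.to_parquet("code_profiler_input.parquet", index=False)
```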
diff --git a/transforms/code/code_profiler/python/pyproject.toml b/transforms/code/code_profiler/python/pyproject.toml deleted file mode 100644 index 4522f61e9..000000000 --- a/transforms/code/code_profiler/python/pyproject.toml +++ /dev/null @@ -1,47 +0,0 @@ -[project] -name = "dpk_code_profiler_transform_python" -version = "0.2.4.dev0" -requires-python = ">=3.10,<3.13" -description = "Code Profiler Python Transform" -license = {text = "Apache-2.0"} -readme = {file = "README.md", content-type = "text/markdown"} -authors = [ - { name = "Pankaj Thorat", email = "pankaj.thorat@ibm.com" }, - { name = "Aishwariya Chakraborty", email = "aishwariya.chakraborty1@ibm.com" }, -] - -dynamic = ["dependencies"] - -[build-system] -requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] -build-backend = "setuptools.build_meta" - -[tool.setuptools.dynamic] -dependencies = {file = ["requirements.txt"]} - -[project.optional-dependencies] -dev = [ - "twine", - "pytest>=7.3.2", - "pytest-dotenv>=0.5.2", - "pytest-env>=1.0.0", - "pre-commit>=3.3.2", - "pytest-cov>=4.1.0", - "pytest-mock>=3.10.0", - "moto==5.0.5", - "markupsafe==2.0.1", -] - -[options] -package_dir = ["src","test"] - -[options.packages.find] -where = ["src/"] - -[tool.pytest.ini_options] -# Currently we use low coverage since we have to run tests separately (see makefile) -#addopts = "--cov --cov-report term-missing --cov-fail-under 25" -markers = ["unit: unit tests", "integration: integration tests"] - -[tool.coverage.run] -include = ["src/*"] diff --git a/transforms/code/code_profiler/python/src/grammar/UAST_Grammar.json b/transforms/code/code_profiler/python/src/grammar/UAST_Grammar.json deleted file mode 100644 index 3aca3be70..000000000 --- a/transforms/code/code_profiler/python/src/grammar/UAST_Grammar.json +++ /dev/null @@ -1,20 +0,0 @@ -{ - "uast_root" : { - "keyword" : "uast_root" - }, - "uast_package" : { - "keyword" : "uast_package" - }, - "uast_comment" : { - "keyword" : "uast_comment" - }, - "uast_class" : { - "keyword" : "uast_class" - }, - "uast_function" : { - "keyword" : "uast_function" - }, - "uast_call" : { - "keyword" : "uast_call" - } -} diff --git a/transforms/code/code_profiler/python/src/ruleset/UAST_rules_java.json b/transforms/code/code_profiler/python/src/ruleset/UAST_rules_java.json deleted file mode 100644 index a884be29e..000000000 --- a/transforms/code/code_profiler/python/src/ruleset/UAST_rules_java.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "import_declaration": { - "uast_node_type": "uast_package", - "extractor": "self.extracted = code_snippet.split('import', 1)[1].strip(' ;')" - }, - "line_comment" : { - "uast_node_type" : "uast_comment", - "extractor" : "self.extracted = code_snippet[2:].strip()\n" - }, - "block_comment" : { - "uast_node_type" : "uast_comment", - "extractor" : "self.extracted = code_snippet[2:-2].strip()\n" - }, - "method_declaration" : { - "uast_node_type" : "uast_function", - "extractor" : "self.extracted = code_snippet.split('(', 1)[0].strip().split(' ')[-1].strip()" - }, - "method_invocation" : { - "uast_node_type" : "uast_call", - "extractor" : "self.extracted = code_snippet.split('(', 1)[0].strip()" - } - -} \ No newline at end of file diff --git a/transforms/code/code_profiler/python/src/ruleset/UAST_rules_py.json b/transforms/code/code_profiler/python/src/ruleset/UAST_rules_py.json deleted file mode 100644 index 400885c27..000000000 --- a/transforms/code/code_profiler/python/src/ruleset/UAST_rules_py.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "import_statement": { - 
"uast_node_type": "uast_package", - "extractor": "text = code_snippet.split('import')[1].strip() \nif (',' in text):\n imports = text.split(',')\n all_imps = []\n for imp in imports:\n imp = imp.strip().split(' ')[0].strip()\n if ('.' in imp):\n imp = imp.split('.')[0]\n all_imps.append(imp)\n all_imps = list(set(all_imps))\n self.extracted = (', ').join(all_imps)\nelse:\n imp = text.strip().split(' ')[0].strip()\n if ('.' in imp):\n imp = imp.split('.')[0]\n self.extracted = imp\n" - }, - "import_from_statement": { - "uast_node_type": "uast_package", - "extractor": "text = code_snippet.split('from', 1)[1].strip()\ntext = text.split(' import')[0]\ntext = text.strip()\nif ('.' in text) :\n self.extracted = text.split('.')[0]\nelse:\n self.extracted = text\n" - }, - "comment" : { - "uast_node_type" : "uast_comment", - "extractor" : "self.extracted = code_snippet[1:]" - }, - "function_definition" : { - "uast_node_type" : "uast_function", - "extractor" : "self.extracted = code_snippet.split('(', 1)[0].strip().split(' ')[-1].strip()" - }, - "class_definition" : { - "uast_node_type" : "uast_class", - "extractor" : "self.extracted = code_snippet.split('class ', 1)[1].split(':', 1)[0].strip()\nif ('(' in self.extracted):\n self.extracted = self.extracted.split('(', 1)[0].strip()" - }, - "call" : { - "uast_node_type" : "uast_call", - "extractor" : "self.extracted = code_snippet.split('(', 1)[0].strip()" - } -} \ No newline at end of file diff --git a/transforms/code/code_profiler/python/test-data/expected/local/metadata.json b/transforms/code/code_profiler/python/test-data/expected/local/metadata.json deleted file mode 100644 index d8ab14844..000000000 --- a/transforms/code/code_profiler/python/test-data/expected/local/metadata.json +++ /dev/null @@ -1,48 +0,0 @@ -{ - "pipeline": "pipeline_id", - "job details": { - "job category": "preprocessing", - "job name": "CodeProfiler", - "job type": "pure python", - "job id": "job_id", - "start_time": "2024-11-27 09:36:19", - "end_time": "2024-11-27 09:36:22", - "status": "success" - }, - "code": null, - "job_input_params": { - "checkpointing": false, - "max_files": -1, - "random_samples": -1, - "files_to_use": [ - ".parquet" - ], - "num_processors": 0 - }, - "execution_stats": { - "cpus": 0.1, - "gpus": 0, - "memory": 3.44, - "object_store": 0, - "execution time, min": 0.053 - }, - "job_output_stats": { - "source_files": 1, - "source_size": 37124, - "result_files": 1, - "result_size": 64492, - "processing_time": 0.901, - "source_documents": 5, - "result_documents": 11, - "source_doc_count": 500, - "result_doc_count": 500 - }, - "source": { - "name": "/root/data-prep-kit/transforms/code/code_profiler/input", - "type": "path" - }, - "target": { - "name": "/root/data-prep-kit/transforms/code/code_profiler/output", - "type": "path" - } -} \ No newline at end of file diff --git a/transforms/code/code_profiler/python/test-data/expected/local/uast_table_part_0.parquet b/transforms/code/code_profiler/python/test-data/expected/local/uast_table_part_0.parquet deleted file mode 100644 index 3cc2f0cef..000000000 Binary files a/transforms/code/code_profiler/python/test-data/expected/local/uast_table_part_0.parquet and /dev/null differ diff --git a/transforms/code/code_profiler/python/test-data/expected/python/metadata.json b/transforms/code/code_profiler/python/test-data/expected/python/metadata.json deleted file mode 100644 index d8ab14844..000000000 --- a/transforms/code/code_profiler/python/test-data/expected/python/metadata.json +++ /dev/null @@ -1,48 +0,0 @@ -{ - 
"pipeline": "pipeline_id", - "job details": { - "job category": "preprocessing", - "job name": "CodeProfiler", - "job type": "pure python", - "job id": "job_id", - "start_time": "2024-11-27 09:36:19", - "end_time": "2024-11-27 09:36:22", - "status": "success" - }, - "code": null, - "job_input_params": { - "checkpointing": false, - "max_files": -1, - "random_samples": -1, - "files_to_use": [ - ".parquet" - ], - "num_processors": 0 - }, - "execution_stats": { - "cpus": 0.1, - "gpus": 0, - "memory": 3.44, - "object_store": 0, - "execution time, min": 0.053 - }, - "job_output_stats": { - "source_files": 1, - "source_size": 37124, - "result_files": 1, - "result_size": 64492, - "processing_time": 0.901, - "source_documents": 5, - "result_documents": 11, - "source_doc_count": 500, - "result_doc_count": 500 - }, - "source": { - "name": "/root/data-prep-kit/transforms/code/code_profiler/input", - "type": "path" - }, - "target": { - "name": "/root/data-prep-kit/transforms/code/code_profiler/output", - "type": "path" - } -} \ No newline at end of file diff --git a/transforms/code/code_profiler/python/test-data/input/data_profiler_params.json b/transforms/code/code_profiler/python/test-data/input/data_profiler_params.json deleted file mode 100644 index 6608d8619..000000000 --- a/transforms/code/code_profiler/python/test-data/input/data_profiler_params.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "input": "multi-package.parquet", - "dynamic_schema_mapping": "True", - "contents": "contents", - "language": "language" -} diff --git a/transforms/code/code_profiler/python/test-data/input/multi-package.parquet b/transforms/code/code_profiler/python/test-data/input/multi-package.parquet deleted file mode 100644 index 6fadd3c04..000000000 Binary files a/transforms/code/code_profiler/python/test-data/input/multi-package.parquet and /dev/null differ diff --git a/transforms/code/code_profiler/ray/.dockerignore b/transforms/code/code_profiler/ray/.dockerignore deleted file mode 100644 index f7275bbbd..000000000 --- a/transforms/code/code_profiler/ray/.dockerignore +++ /dev/null @@ -1 +0,0 @@ -venv/ diff --git a/transforms/code/code_profiler/ray/.gitignore b/transforms/code/code_profiler/ray/.gitignore deleted file mode 100644 index 17cee1df3..000000000 --- a/transforms/code/code_profiler/ray/.gitignore +++ /dev/null @@ -1,37 +0,0 @@ -test-data/output -output/* -/output/ -data-processing-lib/ - -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - - -# Distribution / packaging -bin/ -build/ -develop-eggs/ -dist/ -eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -*.egg-info/ -.installed.cfg -*.egg - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -.tox/ -htmlcov -.coverage -.cache -nosetests.xml -coverage.xml \ No newline at end of file diff --git a/transforms/code/code_profiler/ray/Makefile b/transforms/code/code_profiler/ray/Makefile deleted file mode 100644 index c028cdbe9..000000000 --- a/transforms/code/code_profiler/ray/Makefile +++ /dev/null @@ -1,61 +0,0 @@ -# Define the root of the local git clone for the common rules to be able -# know where they are running from. -REPOROOT=../../../.. -# Include a library of common .transform.* targets which most -# transforms should be able to reuse. However, feel free -# to override/redefine the rules below. 
- -include $(REPOROOT)/transforms/.make.transforms - -# Include the common configuration for this transform -include ../transform.config - -# values possible mach-arm64, x86_64 -export RUNTIME_HOST_ARCH=x86_64 - -BASE_IMAGE=${RAY_BASE_IMAGE} -venv:: .transforms.ray-venv - -test:: .transforms.ray-test - -clean:: .transforms.clean - -image:: .transforms.ray-image - -test-src:: .transforms.test-src - -setup:: .transforms.setup - -test-image:: .transforms.ray-test-image - -build:: build-dist image - -publish: publish-image - -publish-image:: .transforms.publish-image-ray - -setup:: .transforms.setup -# distribution versions is the same as image version. -set-versions: - $(MAKE) TRANSFORM_PYTHON_VERSION=$(CODE_PROFILER_RAY_VERSION) TOML_VERSION=$(CODE_PROFILER_RAY_VERSION) .transforms.set-versions - -build-dist:: .defaults.build-dist - -publish-dist:: .defaults.publish-dist - -# Ensure RUN_ARGS has a default value -RUN_ARGS ?= "" - -run-cli-sample: .transforms.run-cli-ray-sample - -run-local-sample: .transforms.run-local-ray-sample - -run-s3-sample: .transforms.run-s3-ray-sample - -minio-start: .minio-start - -kind-load-image:: .transforms.kind-load-image - -docker-load-image: .defaults.docker-load-image - -docker-save-image: .defaults.docker-save-image diff --git a/transforms/code/code_profiler/ray/README.md b/transforms/code/code_profiler/ray/README.md deleted file mode 100644 index 08ad591f7..000000000 --- a/transforms/code/code_profiler/ray/README.md +++ /dev/null @@ -1,54 +0,0 @@ -# Code Profiler Transform - - -## Configuration and command line Options - -The set of dictionary keys holding [code_profiler_transform](python/src/code_profiler_transform.py) -configuration for values are as follows: - -* content - specifies the column name in the dataframe that has the code snippet -* language - specifies the programming languages of the code snippet - -## Running - -### Launched Command Line Options -The following command line arguments are available in addition to -the options provided by -the [python launcher](../../../../data-processing-lib/doc/python-launcher-options.md). - -### Running the samples - -The code profiler can be run on mach-arm64 and x86_64 host architecture. -Depending on your host architecture, please change the `RUNTIME_HOST_ARCH` in the Makefile. -``` -# values possible mach-arm64, x86_64 -export RUNTIME_HOST_ARCH=x86_64 -``` -If you are using mac, you may need to permit your Mac to load the .so from the security settings. Generally, you get the pop-up under the tab security while running the transform. - -![alt text](image.png) - - -To run the samples, use the following `make` targets - -* `run-local-ray-sample` - runs src/code_profiler_local_ray.py - -These targets will activate the virtual environment and set up any configuration needed. -Use the `-n` option of `make` to see the detail of what is done to run the sample. - -For example, -```shell -make run-local-ray-sample -... -``` -Then -```shell -ls output -``` -To see results of the transform. - -### Transforming data using the transform image - -To use the transform image to transform your data, please refer to the -[running images quickstart](../../../../doc/quick-start/run-transform-image.md), -substituting the name of this transform image and runtime as appropriate. 
diff --git a/transforms/code/code_profiler/ray/pyproject.toml b/transforms/code/code_profiler/ray/pyproject.toml deleted file mode 100644 index b330c09ae..000000000 --- a/transforms/code/code_profiler/ray/pyproject.toml +++ /dev/null @@ -1,45 +0,0 @@ -[project] -name = "dpk_code_profiler_transform_ray" -version = "0.2.4.dev0" -requires-python = ">=3.10,<3.13" -description = "Code Profiler Ray Transform" -license = {text = "Apache-2.0"} -readme = {file = "README.md", content-type = "text/markdown"} -authors = [ - { name = "Pankaj Thorat", email = "pankaj.thorat@ibm.com" }, -] -dependencies = [ - "dpk-code-profiler-transform-python==0.2.4.dev0", - "data-prep-toolkit[ray]>=0.2.4.dev0", - ] - -[build-system] -requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] -build-backend = "setuptools.build_meta" - -[project.optional-dependencies] -dev = [ - "twine", - "pytest>=7.3.2", - "pytest-dotenv>=0.5.2", - "pytest-env>=1.0.0", - "pre-commit>=3.3.2", - "pytest-cov>=4.1.0", - "pytest-mock>=3.10.0", - "moto==5.0.5", - "markupsafe==2.0.1", -] - -[options] -package_dir = ["src","test"] - -[options.packages.find] -where = ["src/"] - -[tool.pytest.ini_options] -# Currently we use low coverage since we have to run tests separately (see makefile) -#addopts = "--cov --cov-report term-missing --cov-fail-under 25" -markers = ["unit: unit tests", "integration: integration tests"] - -[tool.coverage.run] -include = ["src/*"] diff --git a/transforms/code/code_profiler/ray/test-data/expected/metadata.json b/transforms/code/code_profiler/ray/test-data/expected/metadata.json deleted file mode 100644 index d8ab14844..000000000 --- a/transforms/code/code_profiler/ray/test-data/expected/metadata.json +++ /dev/null @@ -1,48 +0,0 @@ -{ - "pipeline": "pipeline_id", - "job details": { - "job category": "preprocessing", - "job name": "CodeProfiler", - "job type": "pure python", - "job id": "job_id", - "start_time": "2024-11-27 09:36:19", - "end_time": "2024-11-27 09:36:22", - "status": "success" - }, - "code": null, - "job_input_params": { - "checkpointing": false, - "max_files": -1, - "random_samples": -1, - "files_to_use": [ - ".parquet" - ], - "num_processors": 0 - }, - "execution_stats": { - "cpus": 0.1, - "gpus": 0, - "memory": 3.44, - "object_store": 0, - "execution time, min": 0.053 - }, - "job_output_stats": { - "source_files": 1, - "source_size": 37124, - "result_files": 1, - "result_size": 64492, - "processing_time": 0.901, - "source_documents": 5, - "result_documents": 11, - "source_doc_count": 500, - "result_doc_count": 500 - }, - "source": { - "name": "/root/data-prep-kit/transforms/code/code_profiler/input", - "type": "path" - }, - "target": { - "name": "/root/data-prep-kit/transforms/code/code_profiler/output", - "type": "path" - } -} \ No newline at end of file diff --git a/transforms/code/code_profiler/ray/test-data/expected/multi-package.parquet b/transforms/code/code_profiler/ray/test-data/expected/multi-package.parquet deleted file mode 100644 index 5d0baa494..000000000 Binary files a/transforms/code/code_profiler/ray/test-data/expected/multi-package.parquet and /dev/null differ diff --git a/transforms/code/code_profiler/ray/test-data/input/data_profiler_params.json b/transforms/code/code_profiler/ray/test-data/input/data_profiler_params.json deleted file mode 100644 index 6608d8619..000000000 --- a/transforms/code/code_profiler/ray/test-data/input/data_profiler_params.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "input": "multi-package.parquet", - "dynamic_schema_mapping": "True", - 
"contents": "contents", - "language": "language" -} diff --git a/transforms/code/code_profiler/ray/test-data/input/multi-package.parquet b/transforms/code/code_profiler/ray/test-data/input/multi-package.parquet deleted file mode 100644 index 6fadd3c04..000000000 Binary files a/transforms/code/code_profiler/ray/test-data/input/multi-package.parquet and /dev/null differ diff --git a/transforms/code/code_profiler/python/requirements.txt b/transforms/code/code_profiler/requirements.txt similarity index 100% rename from transforms/code/code_profiler/python/requirements.txt rename to transforms/code/code_profiler/requirements.txt diff --git a/transforms/code/code_profiler/test-data/expected/metadata.json b/transforms/code/code_profiler/test-data/expected/metadata.json new file mode 100644 index 000000000..406fe3b8a --- /dev/null +++ b/transforms/code/code_profiler/test-data/expected/metadata.json @@ -0,0 +1,46 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "CodeProfiler", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-11-27 09:36:19", + "end_time": "2024-11-27 09:36:22", + "status": "success" + }, + "code": null, + "job_input_params": { + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 0.1, + "gpus": 0, + "memory": 3.44, + "object_store": 0, + "execution time, min": 0.053 + }, + "job_output_stats": { + "source_files": 1, + "source_size": 37124, + "result_files": 1, + "result_size": 64492, + "processing_time": 0.901, + "source_documents": 5, + "result_documents": 11, + "source_doc_count": 500, + "result_doc_count": 500 + }, + "source": { + "name": "/root/data-prep-kit/transforms/code/code_profiler/input", + "type": "path" + }, + "target": { + "name": "/root/data-prep-kit/transforms/code/code_profiler/output", + "type": "path" + } +} diff --git a/transforms/code/code_profiler/python/test-data/expected/python/multi-package.parquet b/transforms/code/code_profiler/test-data/expected/multi-package.parquet similarity index 100% rename from transforms/code/code_profiler/python/test-data/expected/python/multi-package.parquet rename to transforms/code/code_profiler/test-data/expected/multi-package.parquet diff --git a/transforms/code/code_profiler/input/data_profiler_params.json b/transforms/code/code_profiler/test-data/input/data_profiler_params.json similarity index 100% rename from transforms/code/code_profiler/input/data_profiler_params.json rename to transforms/code/code_profiler/test-data/input/data_profiler_params.json diff --git a/transforms/code/code_profiler/input/multi-package.parquet b/transforms/code/code_profiler/test-data/input/multi-package.parquet similarity index 100% rename from transforms/code/code_profiler/input/multi-package.parquet rename to transforms/code/code_profiler/test-data/input/multi-package.parquet diff --git a/transforms/code/code_profiler/python/test/test_code_profiler.py b/transforms/code/code_profiler/test/test_code_profiler.py similarity index 78% rename from transforms/code/code_profiler/python/test/test_code_profiler.py rename to transforms/code/code_profiler/test/test_code_profiler.py index ebd7dcb29..8ba1dcb1f 100644 --- a/transforms/code/code_profiler/python/test/test_code_profiler.py +++ b/transforms/code/code_profiler/test/test_code_profiler.py @@ -11,9 +11,15 @@ ################################################################################ import os -from 
code_profiler_transform import CodeProfilerTransform, CodeProfilerTransformConfiguration from data_processing.test_support import get_tables_in_folder -from data_processing.test_support.transform.table_transform_test import AbstractTableTransformTest +from data_processing.test_support.transform.table_transform_test import ( + AbstractTableTransformTest, +) +from dpk_code_profiler.transform import ( + CodeProfilerTransform, + CodeProfilerTransformConfiguration, +) + class TestCodeProfilerTransform(AbstractTableTransformTest): """ @@ -27,10 +33,15 @@ def get_test_transform_fixtures(self) -> list[tuple]: expected_dir = os.path.join(src_file_dir, "../test-data/expected/local") input_tables = get_tables_in_folder(input_dir) expected_tables = get_tables_in_folder(expected_dir) - expected_metadata_list = [{'result_documents': 11, 'source_documents': 5}, {}] + expected_metadata_list = [{"result_documents": 11, "source_documents": 5}, {}] config = {"contents": "contents", "language": "language"} fixtures = [ - (CodeProfilerTransform(config), input_tables, expected_tables, expected_metadata_list), + ( + CodeProfilerTransform(config), + input_tables, + expected_tables, + expected_metadata_list, + ), ] return fixtures diff --git a/transforms/code/code_profiler/python/test/test_code_profiler_python.py b/transforms/code/code_profiler/test/test_code_profiler_python.py similarity index 90% rename from transforms/code/code_profiler/python/test/test_code_profiler_python.py rename to transforms/code/code_profiler/test/test_code_profiler_python.py index 35555a074..4cf8995b0 100644 --- a/transforms/code/code_profiler/python/test/test_code_profiler_python.py +++ b/transforms/code/code_profiler/test/test_code_profiler_python.py @@ -12,10 +12,11 @@ import os from data_processing.runtime.pure_python import PythonTransformLauncher -from code_profiler_transform_python import CodeProfilerPythonTransformConfiguration from data_processing.test_support.launch.transform_test import ( AbstractTransformLauncherTest, ) +from dpk_code_profiler.transform_python import CodeProfilerPythonTransformConfiguration + class TestCodeProfilerTransformPython(AbstractTransformLauncherTest): """ @@ -29,9 +30,7 @@ def get_test_transform_fixtures(self) -> list[tuple]: launcher = PythonTransformLauncher(CodeProfilerPythonTransformConfiguration()) input_dir = os.path.join(src_file_dir, "../test-data/input") expected_dir = os.path.join(src_file_dir, "../test-data/expected/python") - transform_config = { - "contents": "contents", - "language": "language"} + transform_config = {"contents": "contents", "language": "language"} fixtures.append( ( diff --git a/transforms/code/code_profiler/ray/test/test_code_profiler_ray.py b/transforms/code/code_profiler/test/test_code_profiler_ray.py similarity index 91% rename from transforms/code/code_profiler/ray/test/test_code_profiler_ray.py rename to transforms/code/code_profiler/test/test_code_profiler_ray.py index 21c1088d2..83b98f9ec 100644 --- a/transforms/code/code_profiler/ray/test/test_code_profiler_ray.py +++ b/transforms/code/code_profiler/test/test_code_profiler_ray.py @@ -11,13 +11,14 @@ ################################################################################ import os - +from data_processing.test_support import get_tables_in_folder from data_processing.test_support.launch.transform_test import ( AbstractTransformLauncherTest, ) from data_processing_ray.runtime.ray import RayTransformLauncher -from code_profiler_transform_ray import CodeProfilerRayTransformConfiguration -from 
data_processing.test_support import get_tables_in_folder +from dpk_code_profiler.transform_ray import CodeProfilerRayTransformConfiguration + + class TestCodeProfilerTransformRay(AbstractTransformLauncherTest): """ Extends the super-class to define the test data for the tests defined there. @@ -31,9 +32,7 @@ def get_test_transform_fixtures(self) -> list[tuple]: input_dir = os.path.join(src_file_dir, "../test-data/input") expected_dir = os.path.join(src_file_dir, "../test-data/expected") runtime_config = {"run_locally": True} - transform_config = { - "contents": "contents", - "language": "language"} + transform_config = {"contents": "contents", "language": "language"} fixtures.append( ( launcher, diff --git a/transforms/code/code_profiler/transform.config b/transforms/code/code_profiler/transform.config deleted file mode 100644 index 72daedab6..000000000 --- a/transforms/code/code_profiler/transform.config +++ /dev/null @@ -1,18 +0,0 @@ -# -# This is intended to be included across the Makefiles provided within -# a given transform's directory tree, so must use compatible syntax. -# -################################################################################ -# This defines the name of the transform and is used to match against -# expected files and is used to define the transform's image name. -TRANSFORM_NAME=code_profiler - -################################################################################ -# This defines the transforms' version number as would be used -# when publishing the wheel. In general, only the micro version -# number should be advanced relative to the DPK_VERSION. -# -# If you change the versions numbers, be sure to run "make set-versions" to -# update version numbers across the transform (e.g., pyproject.toml). -CODE_PROFILER_PYTHON_VERSION=$(DPK_VERSION) -CODE_PROFILER_RAY_VERSION=$(CODE_PROFILER_PYTHON_VERSION) \ No newline at end of file diff --git a/transforms/code/code_quality/kfp_ray/code_quality_wf.py b/transforms/code/code_quality/kfp_ray/code_quality_wf.py index 6a4ccec1b..a932a4ce1 100644 --- a/transforms/code/code_quality/kfp_ray/code_quality_wf.py +++ b/transforms/code/code_quality/kfp_ray/code_quality_wf.py @@ -29,6 +29,7 @@ # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" + # compute execution parameters. Here different transforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. 
def compute_exec_params_func( @@ -108,7 +109,14 @@ def code_quality( ray_name: str = "code_quality-kfp-ray", # name of Ray cluster # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, - ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, + ray_worker_options: dict = { + "replicas": 2, + "max_replicas": 2, + "min_replicas": 2, + "cpu": 2, + "memory": 4, + "image": task_image, + }, server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", # data access data_s3_config: str = "{'input_folder': 'test/code_quality/input/', 'output_folder': 'test/code_quality/output/'}", @@ -116,9 +124,13 @@ def code_quality( data_max_files: int = -1, data_num_samples: int = -1, # orchestrator - runtime_actor_options: dict = {'num_cpus': 0.8}, + runtime_actor_options: dict = {"num_cpus": 0.8}, runtime_pipeline_id: str = "runtime_pipeline_id", - runtime_code_location: dict = {'github': 'github', 'commit_hash': '12345', 'path': 'path'}, + runtime_code_location: dict = { + "github": "github", + "commit_hash": "12345", + "path": "path", + }, # code quality parameters cq_contents_column_name: str = "contents", cq_language_column_name: str = "language", @@ -166,7 +178,12 @@ def code_quality( :return: None """ # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params) + clean_up_task = cleanup_ray_op( + ray_name=ray_name, + run_id=run_id, + server_url=server_url, + additional_params=additional_params, + ) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) # pipeline definition with dsl.ExitHandler(clean_up_task): diff --git a/transforms/code/code_quality/python/src/code_quality_transform.py b/transforms/code/code_quality/python/src/code_quality_transform.py index 4defb43fe..cd5ac51b4 100644 --- a/transforms/code/code_quality/python/src/code_quality_transform.py +++ b/transforms/code/code_quality/python/src/code_quality_transform.py @@ -205,7 +205,11 @@ def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Tab """ TransformUtils.validate_columns( - table, [self.code_quality["contents_column_name"], self.code_quality["language_column_name"]] + table, + [ + self.code_quality["contents_column_name"], + self.code_quality["language_column_name"], + ], ) line_mean_values = [] diff --git a/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py b/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py index 9bb315569..58749be7e 100644 --- a/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py +++ b/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py @@ -29,6 +29,7 @@ # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" + # compute execution parameters. Here different transforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. 
def compute_exec_params_func( @@ -106,7 +107,14 @@ def header_cleanser( ray_name: str = "header_cleanser-kfp-ray", # name of Ray cluster # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, - ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, + ray_worker_options: dict = { + "replicas": 2, + "max_replicas": 2, + "min_replicas": 2, + "cpu": 2, + "memory": 4, + "image": task_image, + }, server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", # data access data_s3_config: str = "{'input_folder': 'test/header_cleanser/input/', 'output_folder': 'test/header_cleanser/output/'}", @@ -114,9 +122,13 @@ def header_cleanser( data_max_files: int = -1, data_num_samples: int = -1, # orchestrator - runtime_actor_options: dict = {'num_cpus': 0.8}, + runtime_actor_options: dict = {"num_cpus": 0.8}, runtime_pipeline_id: str = "runtime_pipeline_id", - runtime_code_location: dict = {'github': 'github', 'commit_hash': '12345', 'path': 'path'}, + runtime_code_location: dict = { + "github": "github", + "commit_hash": "12345", + "path": "path", + }, # header cleanser parameters header_cleanser_contents_column_name: str = "contents", header_cleanser_license: bool = True, @@ -162,7 +174,12 @@ def header_cleanser( :return: None """ # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params) + clean_up_task = cleanup_ray_op( + ray_name=ray_name, + run_id=run_id, + server_url=server_url, + additional_params=additional_params, + ) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) # pipeline definition with dsl.ExitHandler(clean_up_task): diff --git a/transforms/code/header_cleanser/python/src/header_cleanser_test_support.py b/transforms/code/header_cleanser/python/src/header_cleanser_test_support.py index 8981052b6..b9792c0fb 100644 --- a/transforms/code/header_cleanser/python/src/header_cleanser_test_support.py +++ b/transforms/code/header_cleanser/python/src/header_cleanser_test_support.py @@ -37,7 +37,10 @@ def _get_launcher(self) -> (AbstractTransformLauncher, dict): Returns: the launcher and any additional command line/configuration included in the list of args given as the 2nd element of the fixtures. 
""" - return (PythonTransformLauncher(HeaderCleanserPythonTransformConfiguration()), {}) + return ( + PythonTransformLauncher(HeaderCleanserPythonTransformConfiguration()), + {}, + ) def _get_test_file_directory(self) -> str: raise NotImplemented diff --git a/transforms/code/header_cleanser/python/src/header_cleanser_transform.py b/transforms/code/header_cleanser/python/src/header_cleanser_transform.py index 00fa3c892..58eb72f10 100644 --- a/transforms/code/header_cleanser/python/src/header_cleanser_transform.py +++ b/transforms/code/header_cleanser/python/src/header_cleanser_transform.py @@ -194,7 +194,11 @@ def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Tab updated_content = pa.array(updated_content) - table = table.set_column(table.column_names.index(self.column_name), self.column_name, updated_content) + table = table.set_column( + table.column_names.index(self.column_name), + self.column_name, + updated_content, + ) return [table], {"Removed code count": remove_code_count} diff --git a/transforms/code/header_cleanser/python/test/test_header_cleanser.py b/transforms/code/header_cleanser/python/test/test_header_cleanser.py index 27d6b84a0..36144fc9a 100644 --- a/transforms/code/header_cleanser/python/test/test_header_cleanser.py +++ b/transforms/code/header_cleanser/python/test/test_header_cleanser.py @@ -49,7 +49,12 @@ def create_header_cleanser_test_fixture( with open(os.path.join(expected_output_dir, "metadata.json"), "r") as meta_file: expected_metadata = json.load(meta_file) expected_metadata_list = [expected_metadata, {}] - return HeaderCleanserTransform(config), [input_df], [expected_output_df], expected_metadata_list + return ( + HeaderCleanserTransform(config), + [input_df], + [expected_output_df], + expected_metadata_list, + ) def get_test_transform_fixtures(self) -> list[tuple]: fixtures = [] diff --git a/transforms/code/header_cleanser/ray/test/test_header_cleanser_ray.py b/transforms/code/header_cleanser/ray/test/test_header_cleanser_ray.py index c3286ad1b..2c63f3067 100644 --- a/transforms/code/header_cleanser/ray/test/test_header_cleanser_ray.py +++ b/transforms/code/header_cleanser/ray/test/test_header_cleanser_ray.py @@ -29,4 +29,7 @@ def _get_test_file_directory(self) -> str: return dir def _get_launcher(self) -> (AbstractTransformLauncher, dict): - return (RayTransformLauncher(HeaderCleanserRayTransformConfiguration()), {"run_locally": True}) + return ( + RayTransformLauncher(HeaderCleanserRayTransformConfiguration()), + {"run_locally": True}, + ) diff --git a/transforms/code/license_select/kfp_ray/license_select_wf.py b/transforms/code/license_select/kfp_ray/license_select_wf.py index 7dba0d9d1..86c4d9074 100644 --- a/transforms/code/license_select/kfp_ray/license_select_wf.py +++ b/transforms/code/license_select/kfp_ray/license_select_wf.py @@ -30,6 +30,7 @@ # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" + # compute execution parameters. Here different transforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. 
def compute_exec_params_func( @@ -119,7 +120,11 @@ def license_select( # orchestrator runtime_actor_options: dict = {"num_cpus": 0.7}, runtime_pipeline_id: str = "runtime_pipeline_id", - runtime_code_location: dict = {'github': 'github', 'commit_hash': '12345', 'path': 'path'}, + runtime_code_location: dict = { + "github": "github", + "commit_hash": "12345", + "path": "path", + }, # license select parameters lc_license_column_name: str = "license", lc_licenses_file: str = "test/license_select/sample_approved_licenses.json", diff --git a/transforms/code/license_select/python/src/license_select_transform.py b/transforms/code/license_select/python/src/license_select_transform.py index a43d399a3..3acc28b30 100644 --- a/transforms/code/license_select/python/src/license_select_transform.py +++ b/transforms/code/license_select/python/src/license_select_transform.py @@ -48,6 +48,7 @@ LICENSE_COLUMN_DEFAULT = "license" LICENSES_KEY = "licenses" + def _get_supported_licenses(license_file: str, data_access: DataAccess) -> list[str]: logger.info(f"Getting supported licenses from file {license_file}") licenses_list = None @@ -119,6 +120,7 @@ def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Tab new_table = self.transformer.transform(table) return [new_table], {} + class LicenseSelectTransformConfiguration(TransformConfiguration): def __init__(self): super().__init__(name="license_select", transform_class=LicenseSelectTransform) @@ -159,13 +161,13 @@ def add_input_params(self, parser: ArgumentParser) -> None: def apply_input_params(self, args: Namespace) -> bool: if not self.daf.apply_input_params(args): return False - + captured = CLIArgumentProvider.capture_parameters(args, CLI_PREFIX, False) license_column_name = captured.get(LICENSE_COLUMN_NAME_KEY) allow_licenses = captured.get(ALLOW_NO_LICENSE_KEY) deny_licenses = captured.get(DENY_LICENSES_KEY, False) licenses_file = captured.get(LICENSES_FILE_KEY) - + # Read licenses from allow-list or deny-list data_access = self.daf.create_data_access() licenses = _get_supported_licenses(licenses_file, data_access) diff --git a/transforms/code/malware/kfp_ray/malware_wf.py b/transforms/code/malware/kfp_ray/malware_wf.py index bede80b88..fb033ef47 100644 --- a/transforms/code/malware/kfp_ray/malware_wf.py +++ b/transforms/code/malware/kfp_ray/malware_wf.py @@ -29,6 +29,7 @@ # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" + # compute execution parameters. Here different transforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. 
def compute_exec_params_func( @@ -100,7 +101,14 @@ def malware( ray_name: str = "malware-kfp-ray", # name of Ray cluster # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, - ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, + ray_worker_options: dict = { + "replicas": 2, + "max_replicas": 2, + "min_replicas": 2, + "cpu": 2, + "memory": 4, + "image": task_image, + }, server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", # data access data_s3_config: str = "{'input_folder': 'test/malware/input', 'output_folder': 'test/malware/output'}", @@ -108,9 +116,13 @@ def malware( data_max_files: int = -1, data_num_samples: int = -1, # orchestrator - runtime_actor_options: dict = {'num_cpus': 0.8}, + runtime_actor_options: dict = {"num_cpus": 0.8}, runtime_pipeline_id: str = "pipeline_id", - runtime_code_location: dict = {'github': 'github', 'commit_hash': '12345', 'path': 'path'}, + runtime_code_location: dict = { + "github": "github", + "commit_hash": "12345", + "path": "path", + }, # mallware malware_input_column: str = "contents", malware_output_column: str = "virus_detection", @@ -155,7 +167,12 @@ def malware( :return: None """ # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params) + clean_up_task = cleanup_ray_op( + ray_name=ray_name, + run_id=run_id, + server_url=server_url, + additional_params=additional_params, + ) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) # pipeline definition with dsl.ExitHandler(clean_up_task): diff --git a/transforms/code/malware/python/src/malware_transform.py b/transforms/code/malware/python/src/malware_transform.py index a6312e064..c8b334f53 100644 --- a/transforms/code/malware/python/src/malware_transform.py +++ b/transforms/code/malware/python/src/malware_transform.py @@ -147,7 +147,6 @@ def _scan(content: str) -> str | None: class MalwareTransformConfiguration(TransformConfiguration): - """ Provides support for configuring and using the associated Transform class include configuration with CLI args and combining of metadata. 
diff --git a/transforms/code/malware/python/test/test_malware.py b/transforms/code/malware/python/test/test_malware.py index d0bf135c8..81525d7f2 100644 --- a/transforms/code/malware/python/test/test_malware.py +++ b/transforms/code/malware/python/test/test_malware.py @@ -22,17 +22,26 @@ table = pa.Table.from_pydict( { "document_id": ["ID_1", "ID_2"], - "contents": ["INNOCENT", "X5O!P%@AP[4\\PZX54(P^)7CC)7}$EICAR-STANDARD-ANTIVIRUS-TEST-FILE!$H+H*"], + "contents": [ + "INNOCENT", + "X5O!P%@AP[4\\PZX54(P^)7CC)7}$EICAR-STANDARD-ANTIVIRUS-TEST-FILE!$H+H*", + ], } ) expected_table = pa.Table.from_pydict( { "document_id": ["ID_1", "ID_2"], - "contents": ["INNOCENT", "X5O!P%@AP[4\\PZX54(P^)7CC)7}$EICAR-STANDARD-ANTIVIRUS-TEST-FILE!$H+H*"], + "contents": [ + "INNOCENT", + "X5O!P%@AP[4\\PZX54(P^)7CC)7}$EICAR-STANDARD-ANTIVIRUS-TEST-FILE!$H+H*", + ], "virus_detection": [None, "Win.Test.EICAR_HDB-1"], } ) -expected_metadata_list = [{"clean": 1, "infected": 1}, {}] # transform() result # flush() result +expected_metadata_list = [ + {"clean": 1, "infected": 1}, + {}, +] # transform() result # flush() result class TestMalwareTransform(AbstractTableTransformTest): @@ -44,7 +53,12 @@ class TestMalwareTransform(AbstractTableTransformTest): def get_test_transform_fixtures(self) -> list[Tuple]: fixtures = [ ( - MalwareTransform({"malware_input_column": "contents", "malware_output_column": "virus_detection"}), + MalwareTransform( + { + "malware_input_column": "contents", + "malware_output_column": "virus_detection", + } + ), [table], [expected_table], expected_metadata_list, diff --git a/transforms/code/malware/ray/test/test_malware_ray.py b/transforms/code/malware/ray/test/test_malware_ray.py index 733e0565d..b87a850c5 100644 --- a/transforms/code/malware/ray/test/test_malware_ray.py +++ b/transforms/code/malware/ray/test/test_malware_ray.py @@ -35,7 +35,11 @@ def get_test_transform_fixtures(self) -> list[tuple]: fixtures = [ ( RayTransformLauncher(MalwareRayTransformConfiguration()), - {"run_locally": True, INPUT_COLUMN_KEY: "contents", OUTPUT_COLUMN_KEY: "virus_detection"}, + { + "run_locally": True, + INPUT_COLUMN_KEY: "contents", + OUTPUT_COLUMN_KEY: "virus_detection", + }, basedir + "/input", basedir + "/expected", ) diff --git a/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py b/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py index 11f001bfa..fc6649999 100644 --- a/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py +++ b/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py @@ -102,7 +102,14 @@ def lang_select( ray_name: str = "proglang-match-kfp-ray", # name of Ray cluster # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, - ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, + ray_worker_options: dict = { + "replicas": 2, + "max_replicas": 2, + "min_replicas": 2, + "cpu": 2, + "memory": 4, + "image": task_image, + }, server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", # data access data_s3_config: str = "{'input_folder': 'test/proglang_select/input/', 'output_folder': 'test/proglang_select/output/'}", @@ -110,9 +117,13 @@ def lang_select( data_max_files: int = -1, data_num_samples: int = -1, # orchestrator - runtime_actor_options: dict = {'num_cpus': 0.8}, + runtime_actor_options: dict = {"num_cpus": 0.8}, runtime_pipeline_id: str = "pipeline_id", - 
runtime_code_location: dict = {'github': 'github', 'commit_hash': '12345', 'path': 'path'}, + runtime_code_location: dict = { + "github": "github", + "commit_hash": "12345", + "path": "path", + }, # Proglang match parameters proglang_select_allowed_langs_file: str = "test/proglang_select/languages/allowed-code-languages.txt", proglang_select_language_column: str = "language", @@ -160,7 +171,12 @@ def lang_select( :return: None """ # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params) + clean_up_task = cleanup_ray_op( + ray_name=ray_name, + run_id=run_id, + server_url=server_url, + additional_params=additional_params, + ) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) # pipeline definition with dsl.ExitHandler(clean_up_task): diff --git a/transforms/code/proglang_select/ray/src/proglang_select_transform_ray.py b/transforms/code/proglang_select/ray/src/proglang_select_transform_ray.py index 139565880..6177330e1 100644 --- a/transforms/code/proglang_select/ray/src/proglang_select_transform_ray.py +++ b/transforms/code/proglang_select/ray/src/proglang_select_transform_ray.py @@ -80,7 +80,10 @@ def get_transform_config( class ProgLangSelectRayConfiguration(RayTransformRuntimeConfiguration): def __init__(self): - super().__init__(transform_config=ProgLangSelectTransformConfiguration(), runtime_class=ProgLangSelectRuntime) + super().__init__( + transform_config=ProgLangSelectTransformConfiguration(), + runtime_class=ProgLangSelectRuntime, + ) if __name__ == "__main__": diff --git a/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py b/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py index 38a829fab..6db705816 100644 --- a/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py +++ b/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py @@ -29,6 +29,7 @@ # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" + # compute execution parameters. Here different transforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. 
def compute_exec_params_func( @@ -135,7 +136,11 @@ def repo_level_order( # orchestrator runtime_actor_options: dict = {"num_cpus": 0.8}, runtime_pipeline_id: str = "pipeline_id", - runtime_code_location: dict = {"github": "github", "commit_hash": "12345", "path": "path"}, + runtime_code_location: dict = { + "github": "github", + "commit_hash": "12345", + "path": "path", + }, # repo_level_order parameters repo_lvl_stage_one_only: bool = False, repo_lvl_grouping_column: str = "repo_name", @@ -197,7 +202,10 @@ def repo_level_order( """ # create clean_up task clean_up_task = cleanup_ray_op( - ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params + ray_name=ray_name, + run_id=run_id, + server_url=server_url, + additional_params=additional_params, ) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) # pipeline definition diff --git a/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/repo_level_wrappers.py b/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/repo_level_wrappers.py index 1e9a24993..19d10177a 100644 --- a/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/repo_level_wrappers.py +++ b/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/repo_level_wrappers.py @@ -34,30 +34,48 @@ def semantic_sort( - df: pd.DataFrame, logger: logging.Logger, title_column_name: str, language_column_name: str + df: pd.DataFrame, + logger: logging.Logger, + title_column_name: str, + language_column_name: str, ) -> pd.DataFrame: return sort_sem( - files_df=df, logger=logger, title_column_name=title_column_name, language_column_name=language_column_name + files_df=df, + logger=logger, + title_column_name=title_column_name, + language_column_name=language_column_name, ) def semantic_sort_normalised( - df: pd.DataFrame, logger: logging.Logger, title_column_name: str, language_column_name: str + df: pd.DataFrame, + logger: logging.Logger, + title_column_name: str, + language_column_name: str, ) -> pd.DataFrame: check_and_update_title(df) return sort_sem( - files_df=df, logger=logger, title_column_name=title_column_name, language_column_name=language_column_name + files_df=df, + logger=logger, + title_column_name=title_column_name, + language_column_name=language_column_name, ) def default_sort( - df: pd.DataFrame, logger: logging.Logger, title_column_name: str, language_column_name: str + df: pd.DataFrame, + logger: logging.Logger, + title_column_name: str, + language_column_name: str, ) -> pd.DataFrame: return sort_by_path(df=df, logger=logger, title_column_name=title_column_name) def get_sorting_func( - sorting_algo: str, title_column_name: str, logger: logging.Logger, language_column_name: str + sorting_algo: str, + title_column_name: str, + logger: logging.Logger, + language_column_name: str, ) -> Callable[[pa.Table], pa.Table]: """Get a sorting function based on the specified algorithm. 
@@ -91,7 +109,10 @@ def sorter(table: pa.Table, file_name: str) -> pa.Table: df = table.to_pandas() try: sorted_df = sort_by( - df=df, logger=logger, title_column_name=title_column_name, language_column_name=language_column_name + df=df, + logger=logger, + title_column_name=title_column_name, + language_column_name=language_column_name, ) except FunctionTimedOut as e: logger.error( @@ -189,7 +210,12 @@ def lang_distribution(grouping_column): return new_table -def get_transforming_func(sorting_func=None, superrows_func=None, filename_func=None, language_column_name="language"): +def get_transforming_func( + sorting_func=None, + superrows_func=None, + filename_func=None, + language_column_name="language", +): """ This function takes three optional functions as input and returns a function that can be applied to a pyarrow table and file name. diff --git a/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/sorting/semantic_ordering/build_dep_graph.py b/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/sorting/semantic_ordering/build_dep_graph.py index a655589ca..78fa57c96 100644 --- a/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/sorting/semantic_ordering/build_dep_graph.py +++ b/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/sorting/semantic_ordering/build_dep_graph.py @@ -434,7 +434,12 @@ def add_files_info_to_analysis_obj(analysis, files_list): return analysis -def build_edges(files_df_org, logger: Logger, title_column_name="new_title", language_column_name="language"): +def build_edges( + files_df_org, + logger: Logger, + title_column_name="new_title", + language_column_name="language", +): full_repo_name = files_df_org.repo_name.to_list()[0] analysis = get_analysis_obj(full_repo_name) files_df = add_repo_path(files_df_org, analysis, title_column_name) diff --git a/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/sorting/semantic_ordering/sort_by_semantic_dep.py b/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/sorting/semantic_ordering/sort_by_semantic_dep.py index 4c309dd84..c81fe097c 100644 --- a/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/sorting/semantic_ordering/sort_by_semantic_dep.py +++ b/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/sorting/semantic_ordering/sort_by_semantic_dep.py @@ -61,7 +61,12 @@ def check_and_update_title(df: pd.DataFrame, title_column="title"): @func_set_timeout(1800) -def sort_sem(files_df: pd.DataFrame, logger: Logger, title_column_name="new_title", language_column_name="language"): +def sort_sem( + files_df: pd.DataFrame, + logger: Logger, + title_column_name="new_title", + language_column_name="language", +): received_shape = files_df.shape supported_bools = files_df.ext.isin(supported_exts) diff --git a/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/sorting/semantic_ordering/topological_sort.py b/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/sorting/semantic_ordering/topological_sort.py index 064b56923..9de614b8d 100644 --- a/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/sorting/semantic_ordering/topological_sort.py +++ b/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/sorting/semantic_ordering/topological_sort.py @@ -135,7 +135,10 @@ def topological_sort_on_df(input_graph, df, logger, title_column_name="new_title logger.debug("Found zero 
nodes without dependency") else: sorted_df_without_dep1, sorted_df_without_dep2 = sort_by_path( - df_without_dep, logger, split_by_filetype=True, title_column_name=title_column_name + df_without_dep, + logger, + split_by_filetype=True, + title_column_name=title_column_name, ) if logger: diff --git a/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/store/store_factory.py b/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/store/store_factory.py index 004beba33..d25040a4e 100644 --- a/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/store/store_factory.py +++ b/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/store/store_factory.py @@ -60,7 +60,11 @@ def validate_store_params(store_params): raise ValueError(f"{store_backend_dir_key} not set for {store_type}") if store_type is store_type_value_s3: # s3 creds are required - for required in [store_s3_keyid_key, store_s3_endpoint_key, store_s3_secret_key]: + for required in [ + store_s3_keyid_key, + store_s3_endpoint_key, + store_s3_secret_key, + ]: if required not in store_params.keys(): raise ValueError(f"Required key: {required} not set for {store_type}") if store_type in [store_type_value_ray]: diff --git a/transforms/code/repo_level_ordering/ray/src/repo_level_order_transform.py b/transforms/code/repo_level_ordering/ray/src/repo_level_order_transform.py index a43feda87..d9c51c8d1 100644 --- a/transforms/code/repo_level_ordering/ray/src/repo_level_order_transform.py +++ b/transforms/code/repo_level_ordering/ray/src/repo_level_order_transform.py @@ -279,7 +279,8 @@ def _prepare_mapper_function(self): self.logger.info("Output by language enabled.") mapper_function_params = mapper_function_params | { "filename_func": get_dominant_language_func( - language_column_name=self.language_column_name, title_column_name="title" + language_column_name=self.language_column_name, + title_column_name="title", ), } diff --git a/transforms/code/repo_level_ordering/ray/test/test_internal_wrappers.py b/transforms/code/repo_level_ordering/ray/test/test_internal_wrappers.py index c9b54d449..024859cfd 100644 --- a/transforms/code/repo_level_ordering/ray/test/test_internal_wrappers.py +++ b/transforms/code/repo_level_ordering/ray/test/test_internal_wrappers.py @@ -60,7 +60,10 @@ def test_sort_by_path(): } ) sort_by_path_ = get_sorting_func( - sorting_algo="SORT_BY_PATH", title_column_name="title", logger=logger, language_column_name="language" + sorting_algo="SORT_BY_PATH", + title_column_name="title", + logger=logger, + language_column_name="language", ) table = sort_by_path_(input_table, "file") compare_tables(table, expected_table) @@ -88,7 +91,10 @@ def test_semantic_sort_reverting_to_default(): } ) sort_by = get_sorting_func( - sorting_algo="SORT_SEMANTIC", title_column_name="title", logger=logger, language_column_name="language" + sorting_algo="SORT_SEMANTIC", + title_column_name="title", + logger=logger, + language_column_name="language", ) table = sort_by(input_table, "file") compare_tables(table, expected_table) @@ -108,7 +114,10 @@ def test_skip_sorting(): } ) sort_by = get_sorting_func( - sorting_algo="SORT_BY_PATH", title_column_name="title", logger=logger, language_column_name="language" + sorting_algo="SORT_BY_PATH", + title_column_name="title", + logger=logger, + language_column_name="language", ) table = sort_by(input_table, "file") compare_tables(table, expected_table) diff --git a/transforms/language/doc_chunk/dpk_doc_chunk/chunkers.py 
b/transforms/language/doc_chunk/dpk_doc_chunk/chunkers.py index b55bd08ff..2d6cad882 100644 --- a/transforms/language/doc_chunk/dpk_doc_chunk/chunkers.py +++ b/transforms/language/doc_chunk/dpk_doc_chunk/chunkers.py @@ -11,13 +11,13 @@ ################################################################################ from abc import ABCMeta, abstractmethod -from typing import Iterator, Optional, Dict, List +from typing import Dict, Iterator, List, Optional +from docling_core.transforms.chunker import DocMeta, HierarchicalChunker from docling_core.types.doc import DoclingDocument -from llama_index.core.node_parser.text.token import TokenTextSplitter from llama_index.core import Document as LIDocument from llama_index.core.node_parser import MarkdownNodeParser -from docling_core.transforms.chunker import HierarchicalChunker, DocMeta +from llama_index.core.node_parser.text.token import TokenTextSplitter class ChunkingExecutor(metaclass=ABCMeta): @@ -70,11 +70,11 @@ def chunk(self, content: str) -> Iterator[dict]: class LITokenTextSplitter(ChunkingExecutor): """ - A text chunker that leverages Llama Index's token-based text splitter. This splitter breaks input text into - fixed-window chunks, with each chunk measured in tokens rather than characters. + A text chunker that leverages Llama Index's token-based text splitter. This splitter breaks input text into + fixed-window chunks, with each chunk measured in tokens rather than characters. - The chunking process ensures that each chunk contains a specific number of tokens, and an optional overlap between - chunks (also measured in tokens) can be specified to preserve context between the chunks. + The chunking process ensures that each chunk contains a specific number of tokens, and an optional overlap between + chunks (also measured in tokens) can be specified to preserve context between the chunks. Args: output_chunk_column_name (str): Name of the output column containing the text of each chunk. @@ -93,15 +93,14 @@ def __init__( self, output_chunk_column_name: str, output_chunk_column_id: str, - chunk_size_tokens: int, - chunk_overlap_tokens: int + chunk_size_tokens: int, + chunk_overlap_tokens: int, ): self.output_chunk_column_name = output_chunk_column_name self.output_chunk_column_id = output_chunk_column_id self.chunk_size = chunk_size_tokens self.chunk_overlap = chunk_overlap_tokens - def _chunk_text(self, text: str) -> List[str]: """ Internal method to chunk text using TokenTextSplitter. @@ -112,13 +111,9 @@ def _chunk_text(self, text: str) -> List[str]: Returns: List[str]: List of chunked text. """ - text_splitter = TokenTextSplitter( - chunk_size=self.chunk_size, - chunk_overlap=self.chunk_overlap - ) + text_splitter = TokenTextSplitter(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap) return text_splitter.split_text(text) - def chunk(self, text: str) -> Iterator[Dict]: """ Chunks input text into fixed-window lengths with token overlap. 
@@ -135,4 +130,4 @@ def chunk(self, text: str) -> Iterator[Dict]: self.output_chunk_column_id: chunk_id, self.output_chunk_column_name: chunk, } - chunk_id += 1 \ No newline at end of file + chunk_id += 1 diff --git a/transforms/language/doc_chunk/dpk_doc_chunk/local_python.py b/transforms/language/doc_chunk/dpk_doc_chunk/local_python.py index 51fd4de50..4923ae192 100644 --- a/transforms/language/doc_chunk/dpk_doc_chunk/local_python.py +++ b/transforms/language/doc_chunk/dpk_doc_chunk/local_python.py @@ -16,8 +16,9 @@ from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing.utils import ParamsUtils -from dpk_doc_chunk.transform_python import DocChunkPythonTransformConfiguration from dpk_doc_chunk.transform import chunking_types +from dpk_doc_chunk.transform_python import DocChunkPythonTransformConfiguration + # create parameters input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) @@ -41,7 +42,7 @@ # "doc_chunk_dl_min_chunk_len": 10, # for testing the usage of the deprecated argument # "doc_chunk_chunking_type": "li_markdown", "doc_chunk_chunking_type": "dl_json", - # "doc_chunk_chunking_type": chunking_types.LI_TOKEN_TEXT, + # "doc_chunk_chunking_type": chunking_types.LI_TOKEN_TEXT, # fixed-size params # "doc_chunk_output_chunk_column_name": "chunk_text", # "doc_chunk_chunk_size_tokens": 128, diff --git a/transforms/language/doc_chunk/dpk_doc_chunk/transform.py b/transforms/language/doc_chunk/dpk_doc_chunk/transform.py index 55c287cc8..8c1c4e2dd 100644 --- a/transforms/language/doc_chunk/dpk_doc_chunk/transform.py +++ b/transforms/language/doc_chunk/dpk_doc_chunk/transform.py @@ -17,7 +17,12 @@ import pyarrow as pa from data_processing.transform import AbstractTableTransform, TransformConfiguration from data_processing.utils import CLIArgumentProvider, TransformUtils, get_logger -from dpk_doc_chunk.chunkers import ChunkingExecutor, DLJsonChunker, LIMarkdown, LITokenTextSplitter +from dpk_doc_chunk.chunkers import ( + ChunkingExecutor, + DLJsonChunker, + LIMarkdown, + LITokenTextSplitter, +) short_name = "doc_chunk" @@ -44,6 +49,7 @@ chunk_size_tokens_cli_param = f"{cli_prefix}{chunk_size_tokens_key}" chunk_overlap_tokens_cli_param = f"{cli_prefix}{chunk_overlap_tokens_key}" + class chunking_types(str, enum.Enum): LI_MARKDOWN = "li_markdown" DL_JSON = "dl_json" @@ -65,6 +71,7 @@ def __str__(self): default_chunk_size_tokens = 128 default_chunk_overlap_tokens = 30 + class DocChunkTransform(AbstractTableTransform): """ Implements a simple copy of a pyarrow Table. 
@@ -88,7 +95,10 @@ def __init__(self, config: dict[str, Any]): self.doc_id_column_name = config.get(doc_id_column_name_key, default_doc_id_column_name) self.output_chunk_column_name = config.get(output_chunk_column_name_key, default_output_chunk_column_name) self.output_chunk_column_id = config.get(output_chunk_column_id_key, default_output_chunk_column_id) - self.output_source_doc_id_column_name = config.get(output_source_doc_id_column_name_key, default_output_source_doc_id_column_name) + self.output_source_doc_id_column_name = config.get( + output_source_doc_id_column_name_key, + default_output_source_doc_id_column_name, + ) # Parameters for Docling JSON chunking self.output_jsonpath_column_name = config.get( @@ -99,7 +109,7 @@ def __init__(self, config: dict[str, Any]): ) self.output_bbox_column_name_key = config.get(output_bbox_column_name_key, default_output_bbox_column_name) - # Parameters for Fixed-size with overlap chunking + # Parameters for Fixed-size with overlap chunking self.chunk_size_tokens = config.get(chunk_size_tokens_key, default_chunk_size_tokens) self.chunk_overlap_tokens = config.get(chunk_overlap_tokens_key, default_chunk_overlap_tokens) @@ -122,7 +132,7 @@ def __init__(self, config: dict[str, Any]): output_chunk_column_name=self.output_chunk_column_name, output_chunk_column_id=self.output_chunk_column_id, chunk_size_tokens=self.chunk_size_tokens, - chunk_overlap_tokens=self.chunk_overlap_tokens + chunk_overlap_tokens=self.chunk_overlap_tokens, ) else: raise RuntimeError(f"{self.chunking_type=} is not valid.") @@ -138,7 +148,9 @@ def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Tab for batch in table.to_batches(): for row in batch.to_pylist(): content: str = row[self.content_column_name] - new_row = {k: v for k, v in row.items() if k not in (self.content_column_name, self.doc_id_column_name)} + new_row = { + k: v for k, v in row.items() if k not in (self.content_column_name, self.doc_id_column_name) + } if self.doc_id_column_name in row: new_row[self.output_source_doc_id_column_name] = row[self.doc_id_column_name] for chunk in self.chunker.chunk(content): @@ -159,7 +171,6 @@ def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Tab class DocChunkTransformConfiguration(TransformConfiguration): - """ Provides support for configuring and using the associated Transform class, including configuration with CLI args. @@ -249,6 +260,8 @@ def apply_input_params(self, args: Namespace) -> bool: self.params = self.params | captured if self.params.get("dl_min_chunk_len") is not None: - self.logger.warning("The `dl_min_chunk_len` option is deprecated and will be ignored. Please stop using it, it will not be accepted anymore in future versions.") + self.logger.warning( + "The `dl_min_chunk_len` option is deprecated and will be ignored. Please stop using it, it will not be accepted anymore in future versions." + ) self.logger.info(f"doc_chunk parameters are: {self.params}") return True diff --git a/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py b/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py index 63be2c878..618c2b5d9 100644 --- a/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py +++ b/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py @@ -28,6 +28,7 @@ # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" + # compute execution parameters. Here different transforms might need different implementations.
As # a result, instead of creating a component we are creating it in place here. def compute_exec_params_func( @@ -121,7 +122,11 @@ def doc_chunk( # orchestrator runtime_actor_options: dict = {"num_cpus": 0.8}, runtime_pipeline_id: str = "pipeline_id", - runtime_code_location: dict = {"github": "github", "commit_hash": "12345", "path": "path"}, + runtime_code_location: dict = { + "github": "github", + "commit_hash": "12345", + "path": "path", + }, # doc_chunk parameters doc_chunk_chunking_type: str = "dl_json", doc_chunk_content_column_name: str = "contents", @@ -171,7 +176,10 @@ def doc_chunk( """ # create clean_up task clean_up_task = cleanup_ray_op( - ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params + ray_name=ray_name, + run_id=run_id, + server_url=server_url, + additional_params=additional_params, ) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) # pipeline definition diff --git a/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py b/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py index d8b0a49af..492fe4841 100644 --- a/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py +++ b/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py @@ -28,6 +28,7 @@ # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" + # compute execution parameters. Here different transforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. def compute_exec_params_func( @@ -105,7 +106,14 @@ def doc_chunk( ray_name: str = "doc-json-chunk-kfp-ray", # name of Ray cluster # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, - ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, + ray_worker_options: dict = { + "replicas": 2, + "max_replicas": 2, + "min_replicas": 2, + "cpu": 2, + "memory": 4, + "image": task_image, + }, server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", # data access data_s3_config: str = "{'input_folder': 'test/doc_chunk/input/', 'output_folder': 'test/doc_chunk/output/'}", @@ -113,9 +121,13 @@ def doc_chunk( data_max_files: int = -1, data_num_samples: int = -1, # orchestrator - runtime_actor_options: dict = {'num_cpus': 0.8}, + runtime_actor_options: dict = {"num_cpus": 0.8}, runtime_pipeline_id: str = "pipeline_id", - runtime_code_location: dict = {'github': 'github', 'commit_hash': '12345', 'path': 'path'}, + runtime_code_location: dict = { + "github": "github", + "commit_hash": "12345", + "path": "path", + }, # doc_chunk parameters doc_chunk_chunking_type: str = "dl_json", doc_chunk_content_column_name: str = "contents", @@ -164,7 +176,12 @@ def doc_chunk( :return: None """ # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params) + clean_up_task = cleanup_ray_op( + ray_name=ray_name, + run_id=run_id, + server_url=server_url, + additional_params=additional_params, + ) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) # pipeline definition with dsl.ExitHandler(clean_up_task): diff --git a/transforms/language/doc_chunk/test/test_doc_chunk_python.py b/transforms/language/doc_chunk/test/test_doc_chunk_python.py index 9f9c9b796..0d34d8e65 100644 --- 
a/transforms/language/doc_chunk/test/test_doc_chunk_python.py +++ b/transforms/language/doc_chunk/test/test_doc_chunk_python.py @@ -17,9 +17,9 @@ AbstractTransformLauncherTest, ) from dpk_doc_chunk.transform import ( - chunking_type_cli_param, + chunking_type_cli_param, + chunking_types, output_chunk_column_name_cli_param, - chunking_types ) from dpk_doc_chunk.transform_python import DocChunkPythonTransformConfiguration @@ -66,7 +66,7 @@ def get_test_transform_fixtures(self) -> list[tuple]: launcher, { chunking_type_cli_param: chunking_types.LI_TOKEN_TEXT, - output_chunk_column_name_cli_param: "chunk_text" + output_chunk_column_name_cli_param: "chunk_text", }, basedir + "/input_token_text", basedir + "/expected_token_text", diff --git a/transforms/language/doc_quality/dpk_doc_quality/transform.py b/transforms/language/doc_quality/dpk_doc_quality/transform.py index 6bc1b359b..3d1876e87 100644 --- a/transforms/language/doc_quality/dpk_doc_quality/transform.py +++ b/transforms/language/doc_quality/dpk_doc_quality/transform.py @@ -126,7 +126,12 @@ def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Tab docq_sentence_count.append(c4_sentence_count(text, ft_lang=self.text_lang)) docq_lorem_ipsum_ratio.append( - c4_contain_pattern_ratio(text, pattern="lorem ipsum", ft_lang=self.text_lang, normalize_text=True) + c4_contain_pattern_ratio( + text, + pattern="lorem ipsum", + ft_lang=self.text_lang, + normalize_text=True, + ) ) curly_bracket_ratio = 0.0 for sign in ["{", "}"]: @@ -154,23 +159,33 @@ def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Tab table = TransformUtils.add_column(table=table, name="docq_total_words", content=docq_total_words) table = TransformUtils.add_column(table=table, name="docq_mean_word_len", content=docq_mean_word_len) table = TransformUtils.add_column( - table=table, name="docq_symbol_to_word_ratio", content=docq_symbol_to_word_ratio + table=table, + name="docq_symbol_to_word_ratio", + content=docq_symbol_to_word_ratio, ) table = TransformUtils.add_column(table=table, name="docq_sentence_count", content=docq_sentence_count) table = TransformUtils.add_column(table=table, name="docq_lorem_ipsum_ratio", content=docq_lorem_ipsum_ratio) table = TransformUtils.add_column( - table=table, name="docq_curly_bracket_ratio", content=docq_curly_bracket_ratio + table=table, + name="docq_curly_bracket_ratio", + content=docq_curly_bracket_ratio, ) table = TransformUtils.add_column(table=table, name="docq_contain_bad_word", content=docq_contain_bad_word) table = TransformUtils.add_column(table=table, name="docq_bullet_point_ratio", content=docq_bullet_point_ratio) table = TransformUtils.add_column( - table=table, name="docq_ellipsis_line_ratio", content=docq_ellipsis_line_ratio + table=table, + name="docq_ellipsis_line_ratio", + content=docq_ellipsis_line_ratio, ) table = TransformUtils.add_column( - table=table, name="docq_alphabet_word_ratio", content=docq_alphabet_word_ratio + table=table, + name="docq_alphabet_word_ratio", + content=docq_alphabet_word_ratio, ) table = TransformUtils.add_column( - table=table, name="docq_contain_common_en_words", content=docq_contain_common_en_words + table=table, + name="docq_contain_common_en_words", + content=docq_contain_common_en_words, ) if self.text_lang == "ja": @@ -185,7 +200,6 @@ def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Tab class DocQualityTransformConfiguration(TransformConfiguration): - """ Provides support for configuring and using the associated 
Transform class, including configuration with CLI args. @@ -207,7 +221,9 @@ def add_input_params(self, parser: ArgumentParser) -> None: (e.g., noop_, pii_, etc.) """ parser.add_argument( - f"--{text_lang_cli_param}", default=default_text_lang, help="language used in the text content" + f"--{text_lang_cli_param}", + default=default_text_lang, + help="language used in the text content", ) parser.add_argument( f"--{doc_content_column_cli_param}", diff --git a/transforms/language/doc_quality/dpk_doc_quality/transform_python.py b/transforms/language/doc_quality/dpk_doc_quality/transform_python.py index 688c56a82..15301faf4 100644 --- a/transforms/language/doc_quality/dpk_doc_quality/transform_python.py +++ b/transforms/language/doc_quality/dpk_doc_quality/transform_python.py @@ -65,7 +65,11 @@ def __init__(self, **kwargs): self.params[text_lang_cli_param] = "en" if bad_word_filepath_cli_param not in self.params: self.params[bad_word_filepath_cli_param] = os.path.abspath( - os.path.join(os.path.dirname(__file__), "ldnoobw", self.params[text_lang_cli_param]) + os.path.join( + os.path.dirname(__file__), + "ldnoobw", + self.params[text_lang_cli_param], + ) ) def transform(self): diff --git a/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py b/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py index 3890cd64e..f2d9b9a53 100644 --- a/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py +++ b/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py @@ -28,6 +28,7 @@ # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" + # compute execution parameters. Here different transforms might need different implementations. As # a result, instead of creating a component we are creating it in place here.
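The doc_quality hunks above append one annotation column per metric. A rough sketch of that pattern with plain pyarrow (the real code goes through TransformUtils.add_column and c4_contain_pattern_ratio; the toy ratio below is an assumption standing in for that helper):

```python
# Per-document annotation sketch: compute one score per row, then append it
# as a new column. Plain pyarrow replaces the repo's TransformUtils helper.
import pyarrow as pa

table = pa.table({"contents": ["lorem ipsum dolor sit", "a perfectly ordinary sentence"]})

docq_lorem_ipsum_ratio = []
for text in table.column("contents").to_pylist():
    # toy stand-in for c4_contain_pattern_ratio(text, pattern="lorem ipsum", ...)
    docq_lorem_ipsum_ratio.append(text.lower().count("lorem ipsum") / max(len(text.split()), 1))

table = table.append_column("docq_lorem_ipsum_ratio", pa.array(docq_lorem_ipsum_ratio))
print(table.to_pydict())
```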
def compute_exec_params_func( @@ -101,7 +102,12 @@ def compute_exec_params_func( def doc_quality( # Ray cluster ray_name: str = "doc_quality-kfp-ray", # name of Ray cluster - ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image, "image_pull_policy": "Always"}, + ray_head_options: dict = { + "cpu": 1, + "memory": 4, + "image": task_image, + "image_pull_policy": "Always", + }, ray_worker_options: dict = { "replicas": 2, "max_replicas": 2, @@ -120,7 +126,11 @@ def doc_quality( # orchestrator runtime_actor_options: dict = {"num_cpus": 0.8}, runtime_pipeline_id: str = "pipeline_id", - runtime_code_location: dict = {"github": "github", "commit_hash": "12345", "path": "path"}, + runtime_code_location: dict = { + "github": "github", + "commit_hash": "12345", + "path": "path", + }, # doc_quality parameters docq_text_lang: str = "en", docq_doc_content_column: str = "contents", @@ -168,7 +178,10 @@ def doc_quality( """ # create clean_up task clean_up_task = cleanup_ray_op( - ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params + ray_name=ray_name, + run_id=run_id, + server_url=server_url, + additional_params=additional_params, ) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) # pipeline definition diff --git a/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py b/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py index be7a09577..ef51e82f3 100644 --- a/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py +++ b/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py @@ -126,7 +126,11 @@ def doc_quality( # orchestrator runtime_actor_options: dict = {"num_cpus": 0.8}, runtime_pipeline_id: str = "pipeline_id", - runtime_code_location: dict = {"github": "github", "commit_hash": "12345", "path": "path"}, + runtime_code_location: dict = { + "github": "github", + "commit_hash": "12345", + "path": "path", + }, # doc_quality parameters docq_text_lang: str = "en", docq_doc_content_column: str = "contents", @@ -174,7 +178,10 @@ def doc_quality( """ # create clean_up task clean_up_task = cleanup_ray_op( - ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params + ray_name=ray_name, + run_id=run_id, + server_url=server_url, + additional_params=additional_params, ) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) # pipeline definition diff --git a/transforms/language/doc_quality/test/test_doc_quality_python.py b/transforms/language/doc_quality/test/test_doc_quality_python.py index dd2557818..adb964908 100644 --- a/transforms/language/doc_quality/test/test_doc_quality_python.py +++ b/transforms/language/doc_quality/test/test_doc_quality_python.py @@ -36,5 +36,12 @@ def get_test_transform_fixtures(self) -> list[tuple]: basedir = os.path.abspath(os.path.join(basedir, "test-data")) fixtures = [] launcher = PythonTransformLauncher(DocQualityPythonTransformConfiguration()) - fixtures.append((launcher, cli_params, os.path.join(basedir, "input"), os.path.join(basedir, "expected"))) + fixtures.append( + ( + launcher, + cli_params, + os.path.join(basedir, "input"), + os.path.join(basedir, "expected"), + ) + ) return fixtures diff --git a/transforms/language/doc_quality/test/test_doc_quality_ray.py b/transforms/language/doc_quality/test/test_doc_quality_ray.py index fcfeb0b4e..add189252 100644 --- a/transforms/language/doc_quality/test/test_doc_quality_ray.py +++ b/transforms/language/doc_quality/test/test_doc_quality_ray.py @@ -35,5 +35,12 @@ def 
get_test_transform_fixtures(self) -> list[tuple]: basedir = os.path.abspath(os.path.join(basedir, "test-data")) fixtures = [] launcher = RayTransformLauncher(DocQualityRayTransformConfiguration()) - fixtures.append((launcher, cli_params, os.path.join(basedir, "input"), os.path.join(basedir, "expected"))) + fixtures.append( + ( + launcher, + cli_params, + os.path.join(basedir, "input"), + os.path.join(basedir, "expected"), + ) + ) return fixtures diff --git a/transforms/language/html2parquet/dpk_html2parquet/__init__.py b/transforms/language/html2parquet/dpk_html2parquet/__init__.py index 84990d26f..0bedd041c 100644 --- a/transforms/language/html2parquet/dpk_html2parquet/__init__.py +++ b/transforms/language/html2parquet/dpk_html2parquet/__init__.py @@ -1,4 +1,4 @@ from .transform import * from .local_python import * from .transform_python import * -from .local import * \ No newline at end of file +from .local import * diff --git a/transforms/language/html2parquet/dpk_html2parquet/local_python.py b/transforms/language/html2parquet/dpk_html2parquet/local_python.py index 0de8cf3f8..9dba89052 100644 --- a/transforms/language/html2parquet/dpk_html2parquet/local_python.py +++ b/transforms/language/html2parquet/dpk_html2parquet/local_python.py @@ -35,7 +35,6 @@ "runtime_pipeline_id": "pipeline_id", "runtime_job_id": "job_id", "runtime_code_location": ParamsUtils.convert_to_ast(code_location), - } if __name__ == "__main__": # Set the simulated command line args diff --git a/transforms/language/html2parquet/dpk_html2parquet/ray/transform.py b/transforms/language/html2parquet/dpk_html2parquet/ray/transform.py index 48aba89eb..b1a4cadbe 100644 --- a/transforms/language/html2parquet/dpk_html2parquet/ray/transform.py +++ b/transforms/language/html2parquet/dpk_html2parquet/ray/transform.py @@ -16,12 +16,16 @@ from data_processing_ray.runtime.ray.runtime_configuration import ( RayTransformRuntimeConfiguration, ) -from dpk_html2parquet.transform import Html2ParquetTransform, Html2ParquetTransformConfiguration - +from dpk_html2parquet.transform import ( + Html2ParquetTransform, + Html2ParquetTransformConfiguration, +) from ray.util.metrics import Counter, Gauge + logger = get_logger(__name__) + class Html2ParquetRayTransform(Html2ParquetTransform): def __init__(self, config: dict): """ """ @@ -30,7 +34,8 @@ def __init__(self, config: dict): self.doc_counter = Counter("worker_html_doc_count", "Number of HTML documents converted by the worker") self.page_counter = Counter("worker_html_pages_count", "Number of HTML pages converted by the worker") self.page_convert_gauge = Gauge( - "worker_html_page_avg_convert_time", "Average time for converting a single HTML page on each worker" + "worker_html_page_avg_convert_time", + "Average time for converting a single HTML page on each worker", ) self.doc_convert_gauge = Gauge("worker_html_convert_time", "Time spent converting a single document") @@ -40,6 +45,7 @@ def _update_metrics(self, num_pages: int, elapse_time: float): self.doc_counter.inc(1) self.page_counter.inc(num_pages) + class Html2ParquetRayTransformConfiguration(RayTransformRuntimeConfiguration): """ Implements the RayTransformConfiguration for HTML2PARQUET as required by the RayTransformLauncher. 
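The metrics wiring above follows Ray's application-level metrics API. A condensed sketch of the same Counter/Gauge pattern, assuming a Ray runtime is available (metric names are copied from the hunk; the sample values are illustrative):

```python
# Worker-side metrics sketch mirroring Html2ParquetRayTransform._update_metrics.
import ray
from ray.util.metrics import Counter, Gauge

ray.init()

doc_counter = Counter("worker_html_doc_count", "Number of HTML documents converted by the worker")
page_convert_gauge = Gauge(
    "worker_html_page_avg_convert_time",
    "Average time for converting a single HTML page on each worker",
)

def update_metrics(num_pages: int, elapse_time: float) -> None:
    # record the average per-page conversion time, then count the document
    if num_pages > 0:
        page_convert_gauge.set(elapse_time / num_pages)
    doc_counter.inc(1)

update_metrics(num_pages=4, elapse_time=0.8)
```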
@@ -57,4 +63,3 @@ def __init__(self): launcher = RayTransformLauncher(Html2ParquetRayTransformConfiguration()) logger.info("Launching html2parquet transform") launcher.launch() - diff --git a/transforms/language/html2parquet/dpk_html2parquet/transform.py b/transforms/language/html2parquet/dpk_html2parquet/transform.py index 30cbb7d95..3056613e3 100644 --- a/transforms/language/html2parquet/dpk_html2parquet/transform.py +++ b/transforms/language/html2parquet/dpk_html2parquet/transform.py @@ -31,8 +31,6 @@ # import data_processing - - class Html2ParquetTransform(AbstractBinaryTransform): def __init__(self, config: dict[str, Any]): super().__init__(config) @@ -126,7 +124,9 @@ def transform_binary(self, file_name: str, byte_array: bytes) -> tuple[list[tupl content_bytes = file.read() row_data = self.convert_html2parquet( - member_filename=member.filename, file_name=file_name, content_bytes=content_bytes + member_filename=member.filename, + file_name=file_name, + content_bytes=content_bytes, ) data.append(row_data) @@ -142,7 +142,9 @@ def transform_binary(self, file_name: str, byte_array: bytes) -> tuple[list[tupl content_bytes = buf.read() row_data = self.convert_html2parquet( - member_filename=None, file_name=file_name, content_bytes=content_bytes + member_filename=None, + file_name=file_name, + content_bytes=content_bytes, ) data.append(row_data) @@ -155,7 +157,6 @@ def transform_binary(self, file_name: str, byte_array: bytes) -> tuple[list[tupl return [(TransformUtils.convert_arrow_to_binary(table=table), ".parquet")], {"nrows": number_of_rows} - logger = get_logger(__name__) short_name = "html2parquet" @@ -252,19 +253,20 @@ def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Tab data = [] for batch in table.to_batches(): for row in batch.to_pylist(): - buf = io.BytesIO(bytes(row['contents'])) + buf = io.BytesIO(bytes(row["contents"])) # Read the content of the HTML file content_bytes = buf.read() row_data = self.convert_html2parquet( - member_filename=None, file_name=row['filename'], content_bytes=content_bytes + member_filename=None, + file_name=row["filename"], + content_bytes=content_bytes, ) data.append({**row, **row_data}) table = pa.Table.from_pylist(data) metadata = { - "columns" : table.schema.names, + "columns": table.schema.names, "nrows": len(table), } return [table], metadata - diff --git a/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py b/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py index 1546c2d30..95d414ecf 100644 --- a/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py +++ b/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py @@ -28,6 +28,7 @@ # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" + # compute execution parameters. Here different transforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. 
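The table-based transform above reads HTML bytes row by row, converts each document, and rebuilds the table from the merged rows. A self-contained sketch of that loop with the converter stubbed out (its body is not shown in this hunk, so the stub's fields are assumptions):

```python
# Row-wise conversion sketch following Html2ParquetTransform.transform.
import io

import pyarrow as pa

def convert_html2parquet(member_filename, file_name, content_bytes) -> dict:
    # hypothetical stand-in for the real converter defined elsewhere in transform.py
    return {"document": file_name, "size": len(content_bytes)}

table = pa.table({"filename": ["a.html"], "contents": [b"<html><body>hi</body></html>"]})

data = []
for batch in table.to_batches():
    for row in batch.to_pylist():
        buf = io.BytesIO(bytes(row["contents"]))
        row_data = convert_html2parquet(
            member_filename=None,
            file_name=row["filename"],
            content_bytes=buf.read(),
        )
        data.append({**row, **row_data})

table = pa.Table.from_pylist(data)
print({"columns": table.schema.names, "nrows": len(table)})
```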
def compute_exec_params_func( @@ -121,7 +122,11 @@ def html2parquet( # orchestrator runtime_actor_options: dict = {"num_cpus": 0.8}, runtime_pipeline_id: str = "pipeline_id", - runtime_code_location: dict = {"github": "github", "commit_hash": "12345", "path": "path"}, + runtime_code_location: dict = { + "github": "github", + "commit_hash": "12345", + "path": "path", + }, # html2parquet parameters data_files_to_use: str = "['.html', '.zip']", html2parquet_output_format: str = "markdown", @@ -167,7 +172,10 @@ def html2parquet( """ # create clean_up task clean_up_task = cleanup_ray_op( - ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params + ray_name=ray_name, + run_id=run_id, + server_url=server_url, + additional_params=additional_params, ) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) # pipeline definition diff --git a/transforms/language/html2parquet/test/test_html2parquet.py b/transforms/language/html2parquet/test/test_html2parquet.py index 71e92d7ec..c9a7dbfae 100644 --- a/transforms/language/html2parquet/test/test_html2parquet.py +++ b/transforms/language/html2parquet/test/test_html2parquet.py @@ -11,6 +11,7 @@ ################################################################################ import os + from data_processing.data_access.data_access_local import DataAccessLocal from data_processing.test_support import get_files_in_folder from data_processing.test_support.transform import AbstractBinaryTransformTest @@ -26,9 +27,7 @@ class TestHtml2ParquetTransform(AbstractBinaryTransformTest): def get_test_transform_fixtures(self) -> list[tuple]: dal = DataAccessLocal() - basedir = os.path.abspath( - os.path.join(os.path.dirname(__file__), "../test-data") - ) + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) input_dir = os.path.join(basedir, "input") input_files = get_files_in_folder(input_dir, ".html") input_files = [(name, binary) for name, binary in input_files.items()] @@ -37,13 +36,16 @@ def get_test_transform_fixtures(self) -> list[tuple]: config = {} expected_files = [ - os.path.join(basedir, "expected", TransformUtils.get_file_basename(input_file).replace(".html", ".parquet")) + os.path.join( + basedir, + "expected", + TransformUtils.get_file_basename(input_file).replace(".html", ".parquet"), + ) for input_file, _ in input_files ] expected_files = [ - (dal.get_file(name)[0], TransformUtils.get_file_extension(name)[1]) - for name in expected_files + (dal.get_file(name)[0], TransformUtils.get_file_extension(name)[1]) for name in expected_files ] return [ ( diff --git a/transforms/language/html2parquet/test/test_html2parquet_python.py b/transforms/language/html2parquet/test/test_html2parquet_python.py index ed52ba380..c3cb71196 100644 --- a/transforms/language/html2parquet/test/test_html2parquet_python.py +++ b/transforms/language/html2parquet/test/test_html2parquet_python.py @@ -19,6 +19,7 @@ ) from dpk_html2parquet.transform_python import Html2ParquetPythonTransformConfiguration + class TestPythonHtml2ParquetTransform(AbstractTransformLauncherTest): """ Extends the super-class to define the test data for the tests defined there. 
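The test hunks above and below all return fixtures of the same shape. A sketch of that convention, inferred from these diffs (paths and the trailing ignore list are illustrative):

```python
# Fixture shape used by these launcher tests, as the surrounding hunks suggest:
# (launcher, cli_params, input_dir, expected_dir[, ignore_columns]).
import os

def get_test_transform_fixtures(launcher, cli_params) -> list[tuple]:
    basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data"))
    return [
        (
            launcher,
            cli_params,
            os.path.join(basedir, "input"),
            os.path.join(basedir, "expected"),
            ["date_acquired"],  # columns excluded from the table comparison
        )
    ]
```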
@@ -44,8 +45,6 @@ def get_test_transform_fixtures(self) -> list[tuple]: basedir + "/input", basedir + "/expected", ignore_columns, - ) ) return fixtures - diff --git a/transforms/language/html2parquet/test/test_html2parquet_ray.py b/transforms/language/html2parquet/test/test_html2parquet_ray.py index d4c886f2f..c97ec9952 100644 --- a/transforms/language/html2parquet/test/test_html2parquet_ray.py +++ b/transforms/language/html2parquet/test/test_html2parquet_ray.py @@ -43,7 +43,6 @@ def get_test_transform_fixtures(self) -> list[tuple]: basedir + "/expected", # this is added as a fixture to remove these columns from comparison ["date_acquired"], - ) ) return fixtures diff --git a/transforms/language/lang_id/dpk_lang_id/lang_models.py b/transforms/language/lang_id/dpk_lang_id/lang_models.py index dfbf01e65..fa2504b88 100644 --- a/transforms/language/lang_id/dpk_lang_id/lang_models.py +++ b/transforms/language/lang_id/dpk_lang_id/lang_models.py @@ -41,7 +41,10 @@ def detect_lang(self, text: str) -> tuple[str, float]: label, score = self.nlp.predict( text.replace("\n", " "), 1 ) # replace newlines to avoid the fasttext ERROR: predict processes one line at a time (remove '\n'), which would otherwise skip the file - return standardize_tag(label[0].replace("__label__", "")), math.floor(score[0] * 1000) / 1000 + return ( + standardize_tag(label[0].replace("__label__", "")), + math.floor(score[0] * 1000) / 1000, + ) class LangModelFactory: diff --git a/transforms/language/lang_id/dpk_lang_id/transform.py b/transforms/language/lang_id/dpk_lang_id/transform.py index 2d6f57923..bbf5495c2 100644 --- a/transforms/language/lang_id/dpk_lang_id/transform.py +++ b/transforms/language/lang_id/dpk_lang_id/transform.py @@ -56,7 +56,9 @@ def __init__(self, config: dict[str, Any]): # of LangIdentificationTransformConfiguration class super().__init__(config) self.nlp_langid = LangModelFactory.create_model( - config.get(model_kind_key), config.get(model_url_key), config.get(model_credential_key) + config.get(model_kind_key), + config.get(model_url_key), + config.get(model_credential_key), ) self.content_column_name = config.get(content_column_name_key, default_content_column_name) self.output_lang_column_name = config.get(output_lang_column_name_key, default_output_lang_column_name) @@ -89,7 +91,6 @@ def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Tab class LangIdentificationTransformConfiguration(TransformConfiguration): - """ Provides support for configuring and using the associated Transform class, including configuration with CLI args.
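detect_lang above normalizes the raw fasttext output in two steps: strip the `__label__` prefix and standardize the tag, then truncate the confidence to three decimals. A sketch, assuming standardize_tag comes from the langcodes package (its import is not shown in this hunk) and that the model emits labels like `__label__en`:

```python
# Label/score normalization sketch following detect_lang in lang_models.py.
import math

from langcodes import standardize_tag  # assumed source of standardize_tag

label = ["__label__en"]  # assumed shape of fasttext's predict() output
score = [0.98765]

lang = standardize_tag(label[0].replace("__label__", ""))
confidence = math.floor(score[0] * 1000) / 1000  # truncate, don't round

print(lang, confidence)  # en 0.987
```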
@@ -116,8 +117,16 @@ def add_input_params(self, parser: ArgumentParser) -> None: required=True, help="Credential to access model for language detection placed in URL", ) - parser.add_argument(f"--{model_kind_cli_param}", required=True, help="Kind of model for language detection") - parser.add_argument(f"--{model_url_cli_param}", required=True, help="URL to model for language detection") + parser.add_argument( + f"--{model_kind_cli_param}", + required=True, + help="Kind of model for language detection", + ) + parser.add_argument( + f"--{model_url_cli_param}", + required=True, + help="URL to model for language detection", + ) parser.add_argument( f"--{content_column_name_cli_param}", default=default_content_column_name, diff --git a/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py b/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py index 96d1f11ce..79152186b 100644 --- a/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py +++ b/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py @@ -28,6 +28,7 @@ # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" + # compute execution parameters. Here different transforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. def compute_exec_params_func( @@ -125,7 +126,11 @@ def lang_id( # orchestrator runtime_actor_options: dict = {"num_cpus": 0.8}, runtime_pipeline_id: str = "pipeline_id", - runtime_code_location: dict = {"github": "github", "commit_hash": "12345", "path": "path"}, + runtime_code_location: dict = { + "github": "github", + "commit_hash": "12345", + "path": "path", + }, # lang_id parameters lang_id_model_credential: str = "PUT YOUR OWN HUGGINGFACE CREDENTIAL", lang_id_model_kind: str = "fasttext", @@ -179,7 +184,10 @@ def lang_id( """ # create clean_up task clean_up_task = cleanup_ray_op( - ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params + ray_name=ray_name, + run_id=run_id, + server_url=server_url, + additional_params=additional_params, ) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) # pipeline definition diff --git a/transforms/language/lang_id/kfp_ray/lang_id_wf.py b/transforms/language/lang_id/kfp_ray/lang_id_wf.py index 19f293a5d..7492e923b 100644 --- a/transforms/language/lang_id/kfp_ray/lang_id_wf.py +++ b/transforms/language/lang_id/kfp_ray/lang_id_wf.py @@ -126,7 +126,11 @@ def lang_id( # orchestrator runtime_actor_options: dict = {"num_cpus": 0.8}, runtime_pipeline_id: str = "pipeline_id", - runtime_code_location: dict = {"github": "github", "commit_hash": "12345", "path": "path"}, + runtime_code_location: dict = { + "github": "github", + "commit_hash": "12345", + "path": "path", + }, # lang_id parameters lang_id_model_credential: str = "PUT YOUR OWN HUGGINGFACE CREDENTIAL", lang_id_model_kind: str = "fasttext", @@ -180,7 +184,10 @@ def lang_id( """ # create clean_up task clean_up_task = cleanup_ray_op( - ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params + ray_name=ray_name, + run_id=run_id, + server_url=server_url, + additional_params=additional_params, ) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) # pipeline definition diff --git a/transforms/language/lang_id/test/test_nlp.py b/transforms/language/lang_id/test/test_nlp.py index e61b2cc1c..058b94ddd 100644 --- a/transforms/language/lang_id/test/test_nlp.py +++
b/transforms/language/lang_id/test/test_nlp.py @@ -17,7 +17,9 @@ def test_language_identification(): nlp_langid = LangModelFactory.create_model( - KIND_FASTTEXT, "facebook/fasttext-language-identification", "YOUR HUGGING FACE ACCOUNT TOKEN" + KIND_FASTTEXT, + "facebook/fasttext-language-identification", + "YOUR HUGGING FACE ACCOUNT TOKEN", ) documents = pa.array( [ diff --git a/transforms/language/pdf2parquet/dpk_pdf2parquet/ray/local_ray.py b/transforms/language/pdf2parquet/dpk_pdf2parquet/ray/local_ray.py index aeb049779..ff53615e3 100644 --- a/transforms/language/pdf2parquet/dpk_pdf2parquet/ray/local_ray.py +++ b/transforms/language/pdf2parquet/dpk_pdf2parquet/ray/local_ray.py @@ -15,6 +15,8 @@ import sys from data_processing.utils import ParamsUtils + + try: from data_processing_ray.runtime.ray import RayTransformLauncher from dpk_pdf2parquet.ray.transform import Pdf2ParquetRayTransformConfiguration diff --git a/transforms/language/pdf2parquet/dpk_pdf2parquet/ray/s3_ray.py b/transforms/language/pdf2parquet/dpk_pdf2parquet/ray/s3_ray.py index 63d48174a..9d05d9378 100644 --- a/transforms/language/pdf2parquet/dpk_pdf2parquet/ray/s3_ray.py +++ b/transforms/language/pdf2parquet/dpk_pdf2parquet/ray/s3_ray.py @@ -15,6 +15,8 @@ import sys from data_processing.utils import ParamsUtils + + try: from data_processing_ray.runtime.ray import RayTransformLauncher from dpk_pdf2parquet.ray.transform import Pdf2ParquetRayTransformConfiguration diff --git a/transforms/language/pdf2parquet/dpk_pdf2parquet/ray/transform.py b/transforms/language/pdf2parquet/dpk_pdf2parquet/ray/transform.py index 7167a8b4d..969a55955 100644 --- a/transforms/language/pdf2parquet/dpk_pdf2parquet/ray/transform.py +++ b/transforms/language/pdf2parquet/dpk_pdf2parquet/ray/transform.py @@ -41,7 +41,8 @@ def __init__(self, config: dict): self.doc_counter = Counter("worker_pdf_doc_count", "Number of PDF documents converted by the worker") self.page_counter = Counter("worker_pdf_pages_count", "Number of PDF pages converted by the worker") self.page_convert_gauge = Gauge( - "worker_pdf_page_avg_convert_time", "Average time for converting a single PDF page on each worker" + "worker_pdf_page_avg_convert_time", + "Average time for converting a single PDF page on each worker", ) self.doc_convert_gauge = Gauge("worker_pdf_convert_time", "Time spent converting a single document") diff --git a/transforms/language/pdf2parquet/dpk_pdf2parquet/transform.py b/transforms/language/pdf2parquet/dpk_pdf2parquet/transform.py index ddfbb5109..f0a91abc8 100644 --- a/transforms/language/pdf2parquet/dpk_pdf2parquet/transform.py +++ b/transforms/language/pdf2parquet/dpk_pdf2parquet/transform.py @@ -23,12 +23,15 @@ import filetype import pandas as pd + + try: from pandas.io.json import ujson_dumps except: from pandas.io.json import dumps as ujson_dumps -import pyarrow as pa + import numpy as np +import pyarrow as pa from data_processing.transform import AbstractBinaryTransform, TransformConfiguration from data_processing.utils import TransformUtils, get_logger, str2bool from data_processing.utils.cli_utils import CLIArgumentProvider @@ -103,18 +106,12 @@ def __str__(self): pdf2parquet_batch_size_cli_param = f"{cli_prefix}{pdf2parquet_batch_size_key}" pdf2parquet_artifacts_path_cli_param = f"{cli_prefix}{pdf2parquet_artifacts_path_key}" pdf2parquet_contents_type_cli_param = f"{cli_prefix}{pdf2parquet_contents_type_key}" -pdf2parquet_do_table_structure_cli_param = ( - f"{cli_prefix}{pdf2parquet_do_table_structure_key}" -) 
+pdf2parquet_do_table_structure_cli_param = f"{cli_prefix}{pdf2parquet_do_table_structure_key}" pdf2parquet_do_ocr_cli_param = f"{cli_prefix}{pdf2parquet_do_ocr_key}" -pdf2parquet_bitmap_area_threshold__cli_param = ( - f"{cli_prefix}{pdf2parquet_bitmap_area_threshold_key}" -) +pdf2parquet_bitmap_area_threshold__cli_param = f"{cli_prefix}{pdf2parquet_bitmap_area_threshold_key}" pdf2parquet_ocr_engine_cli_param = f"{cli_prefix}{pdf2parquet_ocr_engine_key}" pdf2parquet_pdf_backend_cli_param = f"{cli_prefix}{pdf2parquet_pdf_backend_key}" -pdf2parquet_double_precision_cli_param = ( - f"{cli_prefix}{pdf2parquet_double_precision_key}" -) +pdf2parquet_double_precision_cli_param = f"{cli_prefix}{pdf2parquet_double_precision_key}" class Pdf2ParquetTransform(AbstractBinaryTransform): @@ -134,32 +131,24 @@ def __init__(self, config: dict): self.artifacts_path = config.get(pdf2parquet_artifacts_path_key, None) if self.artifacts_path is not None: self.artifacts_path = Path(self.artifacts_path) - self.contents_type = config.get( - pdf2parquet_contents_type_key, pdf2parquet_contents_types.MARKDOWN - ) + self.contents_type = config.get(pdf2parquet_contents_type_key, pdf2parquet_contents_types.MARKDOWN) if not isinstance(self.contents_type, pdf2parquet_contents_types): self.contents_type = pdf2parquet_contents_types[self.contents_type] self.do_table_structure = config.get( pdf2parquet_do_table_structure_key, pdf2parquet_do_table_structure_default ) self.do_ocr = config.get(pdf2parquet_do_ocr_key, pdf2parquet_do_ocr_default) - self.ocr_engine_name = config.get( - pdf2parquet_ocr_engine_key, pdf2parquet_ocr_engine_default - ) + self.ocr_engine_name = config.get(pdf2parquet_ocr_engine_key, pdf2parquet_ocr_engine_default) if not isinstance(self.ocr_engine_name, pdf2parquet_ocr_engine): self.ocr_engine_name = pdf2parquet_ocr_engine[self.ocr_engine_name] self.bitmap_area_threshold = config.get( pdf2parquet_bitmap_area_threshold_key, pdf2parquet_bitmap_area_threshold_default, ) - self.pdf_backend_name = config.get( - pdf2parquet_pdf_backend_key, pdf2parquet_pdf_backend_default - ) + self.pdf_backend_name = config.get(pdf2parquet_pdf_backend_key, pdf2parquet_pdf_backend_default) if not isinstance(self.pdf_backend_name, pdf2parquet_pdf_backend): self.pdf_backend_name = pdf2parquet_pdf_backend[self.pdf_backend_name] - self.double_precision = config.get( - pdf2parquet_double_precision_key, pdf2parquet_double_precision_default - ) + self.double_precision = config.get(pdf2parquet_double_precision_key, pdf2parquet_double_precision_default) logger.info("Initializing models") pipeline_options = PdfPipelineOptions( @@ -172,9 +161,7 @@ def __init__(self, config: dict): lock = MultiLock("dpk_pdf2parquet_init") try: - logger.debug( - f"Going to acquire lock {lock.lock_filename} for synchronizing global filesystem operations." 
- ) + logger.debug(f"Going to acquire lock {lock.lock_filename} for synchronizing global filesystem operations.") locked = lock.acquire() logger.debug(f"Lock {lock.lock_filename} acquired.") @@ -190,7 +177,7 @@ def __init__(self, config: dict): finally: lock.release() logger.debug(f"Lock {lock.lock_filename} released.") - + self.buffer = [] def _get_ocr_engine(self, engine_name: pdf2parquet_ocr_engine) -> OcrOptions: @@ -217,9 +204,7 @@ def _update_metrics(self, num_pages: int, elapse_time: float): # This is implemented in the ray version pass - def _convert_pdf2parquet( - self, doc_filename: str, ext: str, content_bytes: bytes - ) -> dict: + def _convert_pdf2parquet(self, doc_filename: str, ext: str, content_bytes: bytes) -> dict: # Convert PDF to Markdown start_time = time.time() buf = io.BytesIO(content_bytes) @@ -234,15 +219,15 @@ def _convert_pdf2parquet( elif self.contents_type == pdf2parquet_contents_types.TEXT: content_string = doc.export_to_text() elif self.contents_type == pdf2parquet_contents_types.JSON: - content_string = ujson_dumps( - doc.export_to_dict(), double_precision=self.double_precision - ) + content_string = ujson_dumps(doc.export_to_dict(), double_precision=self.double_precision) else: raise RuntimeError(f"Unknown contents_type {self.contents_type}.") num_pages = len(doc.pages) num_tables = len(doc.tables) num_doc_elements = len(doc.texts) - document_hash = str(doc.origin.binary_hash) # we turn the uint64 hash into str, because it is easier to handle for pyarrow + document_hash = str( + doc.origin.binary_hash + ) # we turn the uint64 hash into str, because it is easier to handle for pyarrow self._update_metrics(num_pages=num_pages, elapse_time=elapse_time) @@ -263,9 +248,7 @@ def _convert_pdf2parquet( return file_data - def transform_binary( - self, file_name: str, byte_array: bytes - ) -> tuple[list[tuple[bytes, str]], dict[str, Any]]: + def transform_binary(self, file_name: str, byte_array: bytes) -> tuple[list[tuple[bytes, str]], dict[str, Any]]: """ If file_name is detected as a PDF file, it generates a pyarrow table with a single row containing the document converted to markdown format. @@ -294,9 +277,7 @@ def transform_binary( doc_filename=file_name, ext=root_ext, content_bytes=byte_array ) - file_data["source_filename"] = TransformUtils.get_file_basename( - file_name - ) + file_data["source_filename"] = TransformUtils.get_file_basename(file_name) data.append(file_data) number_of_rows += 1 @@ -304,15 +285,11 @@ def transform_binary( except Exception as e: failed_doc_id.append(file_name) - logger.warning( - f"Exception {str(e)} processing file {file_name}, skipping" - ) + logger.warning(f"Exception {str(e)} processing file {file_name}, skipping") # Process ZIP archive of documents elif root_kind is not None and root_kind.mime == "application/zip": - logger.debug( - f"Detected root file {file_name=} as ZIP.
Iterating through the archive content.") with zipfile.ZipFile(io.BytesIO(byte_array)) as opened_zip: zip_namelist = opened_zip.namelist() @@ -342,9 +319,7 @@ def transform_binary( ext=ext, content_bytes=content_bytes, ) - file_data["source_filename"] = ( - TransformUtils.get_file_basename(file_name) - ) + file_data["source_filename"] = TransformUtils.get_file_basename(file_name) data.append(file_data) success_doc_id.append(archive_doc_filename) @@ -352,16 +327,13 @@ def transform_binary( except Exception as e: failed_doc_id.append(archive_doc_filename) - logger.warning( - f"Exception {str(e)} processing file {archive_doc_filename}, skipping" - ) + logger.warning(f"Exception {str(e)} processing file {archive_doc_filename}, skipping") else: logger.warning( f"File {file_name=} is not detected as a supported type nor as ZIP but {kind=}. Skipping." ) - metadata = { "nrows": number_of_rows, "nsuccess": len(success_doc_id), @@ -380,12 +352,17 @@ def transform_binary( num_left = len(data) start_row = 0 while num_left >= self.batch_size: - table = pa.Table.from_pylist(data[start_row:self.batch_size]) - batch_results.append((TransformUtils.convert_arrow_to_binary(table=table), ".parquet")) - + table = pa.Table.from_pylist(data[start_row : self.batch_size]) + batch_results.append( + ( + TransformUtils.convert_arrow_to_binary(table=table), + ".parquet", + ) + ) + start_row += self.batch_size num_left = num_left - self.batch_size - + if num_left >= 0: self.buffer = data[start_row:] @@ -412,9 +389,7 @@ class Pdf2ParquetTransformConfiguration(TransformConfiguration): configuration with CLI args and combining of metadata. """ - def __init__( - self, transform_class: type[AbstractBinaryTransform] = Pdf2ParquetTransform - ): + def __init__(self, transform_class: type[AbstractBinaryTransform] = Pdf2ParquetTransform): super().__init__( name=shortname, transform_class=transform_class, diff --git a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py index 220a72407..7e4c6e5e9 100644 --- a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py +++ b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py @@ -28,6 +28,7 @@ # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" + # compute execution parameters. Here different transforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. 
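transform_binary above dispatches on the sniffed MIME type: single documents are converted directly, and ZIP archives are walked member by member. A condensed sketch using the same filetype/zipfile calls, with the conversion itself stubbed out and the broader supported-type check reduced to PDF for brevity:

```python
# Type-dispatch sketch following Pdf2ParquetTransform.transform_binary.
import io
import zipfile

import filetype  # the same sniffing library the transform imports

def handle(file_name: str, byte_array: bytes) -> None:
    root_kind = filetype.guess(byte_array)
    if root_kind is not None and root_kind.mime == "application/pdf":
        print(f"convert single document {file_name}")
    elif root_kind is not None and root_kind.mime == "application/zip":
        with zipfile.ZipFile(io.BytesIO(byte_array)) as opened_zip:
            for archive_doc_filename in opened_zip.namelist():
                with opened_zip.open(archive_doc_filename) as member:
                    content_bytes = member.read()
                    print(f"convert archived document {archive_doc_filename} ({len(content_bytes)} bytes)")
    else:
        print(f"File {file_name=} is not detected as a supported type nor as ZIP. Skipping.")
```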
def compute_exec_params_func( @@ -123,7 +124,11 @@ def pdf2parquet( # orchestrator runtime_actor_options: dict = {"num_cpus": 0.8}, runtime_pipeline_id: str = "pipeline_id", - runtime_code_location: dict = {"github": "github", "commit_hash": "12345", "path": "path"}, + runtime_code_location: dict = { + "github": "github", + "commit_hash": "12345", + "path": "path", + }, # pdf2parquet parameters pdf2parquet_batch_size: int = -1, pdf2parquet_do_table_structure: bool = True, @@ -175,7 +180,10 @@ def pdf2parquet( """ # create clean_up task clean_up_task = cleanup_ray_op( - ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params + ray_name=ray_name, + run_id=run_id, + server_url=server_url, + additional_params=additional_params, ) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) # pipeline definition diff --git a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py index 42a3a16a1..04268223a 100644 --- a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py +++ b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py @@ -28,6 +28,7 @@ # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" + # compute execution parameters. Here different transforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. def compute_exec_params_func( @@ -127,7 +128,11 @@ def pdf2parquet( # orchestrator runtime_actor_options: dict = {"num_cpus": 4}, runtime_pipeline_id: str = "pipeline_id", - runtime_code_location: dict = {"github": "github", "commit_hash": "12345", "path": "path"}, + runtime_code_location: dict = { + "github": "github", + "commit_hash": "12345", + "path": "path", + }, # pdf2parquet parameters pdf2parquet_batch_size: int = -1, pdf2parquet_do_table_structure: bool = True, @@ -179,7 +184,10 @@ def pdf2parquet( """ # create clean_up task clean_up_task = cleanup_ray_op( - ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params + ray_name=ray_name, + run_id=run_id, + server_url=server_url, + additional_params=additional_params, ) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) # pipeline definition diff --git a/transforms/language/pii_redactor/kfp_ray/pii_redactor_wf.py b/transforms/language/pii_redactor/kfp_ray/pii_redactor_wf.py index d457470ba..aba021bd2 100644 --- a/transforms/language/pii_redactor/kfp_ray/pii_redactor_wf.py +++ b/transforms/language/pii_redactor/kfp_ray/pii_redactor_wf.py @@ -27,6 +27,7 @@ # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" + # compute execution parameters. Here different transforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. 
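The analyze_text hunk below wraps presidio's AnalyzerEngine. A minimal stand-alone version using presidio's default engine setup (the repo wires a custom nlp_engine and registry, omitted here; this assumes presidio-analyzer and its default spaCy model are installed):

```python
# Minimal presidio analysis sketch following pii_analyzer.analyze_text.
from presidio_analyzer import AnalyzerEngine

analyzer = AnalyzerEngine()
analyze_results = analyzer.analyze(
    text="My name is John and my email is john@example.com",
    language="en",
    entities=["PERSON", "EMAIL_ADDRESS"],
    score_threshold=0.6,
)

entity_types = [result.entity_type for result in analyze_results]
print(entity_types)  # e.g. ['PERSON', 'EMAIL_ADDRESS']
```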
def compute_exec_params_func( @@ -115,7 +116,11 @@ def pii_redactor( # orchestrator runtime_actor_options: dict = {"num_cpus": 0.8}, runtime_pipeline_id: str = "pipeline_id", - runtime_code_location: dict = {"github": "github", "commit_hash": "12345", "path": "path"}, + runtime_code_location: dict = { + "github": "github", + "commit_hash": "12345", + "path": "path", + }, # pii_redactor parameters pii_redactor_contents: str = "title", # additional parameters @@ -159,7 +164,10 @@ def pii_redactor( """ # create clean_up task clean_up_task = cleanup_ray_op( - ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params + ray_name=ray_name, + run_id=run_id, + server_url=server_url, + additional_params=additional_params, ) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) # pipeline definition diff --git a/transforms/language/pii_redactor/python/src/pii_analyzer.py b/transforms/language/pii_redactor/python/src/pii_analyzer.py index 894c7ec35..edebf8b00 100644 --- a/transforms/language/pii_redactor/python/src/pii_analyzer.py +++ b/transforms/language/pii_redactor/python/src/pii_analyzer.py @@ -77,7 +77,10 @@ def analyze_text(self, text, language="en"): """ analyzer = AnalyzerEngine(nlp_engine=self.nlp_engine, registry=self.registry) analyze_results = analyzer.analyze( - text=text, language=language, entities=self.supported_entities, score_threshold=self.score_threshold + text=text, + language=language, + entities=self.supported_entities, + score_threshold=self.score_threshold, ) entity_types = [result.entity_type for result in analyze_results] return analyze_results, entity_types diff --git a/transforms/language/pii_redactor/python/src/pii_redactor_local.py b/transforms/language/pii_redactor/python/src/pii_redactor_local.py index baa6d3894..bbbd387c2 100644 --- a/transforms/language/pii_redactor/python/src/pii_redactor_local.py +++ b/transforms/language/pii_redactor/python/src/pii_redactor_local.py @@ -22,7 +22,10 @@ # create parameters input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) -pii_config = {supported_entities_key: ["PERSON"], doc_transformed_contents_key: "new_contents"} +pii_config = { + supported_entities_key: ["PERSON"], + doc_transformed_contents_key: "new_contents", +} if __name__ == "__main__": # Here we show how to run outside of the runtime # Create and configure the transform. diff --git a/transforms/language/pii_redactor/python/src/pii_redactor_transform.py b/transforms/language/pii_redactor/python/src/pii_redactor_transform.py index 6c1d1c17f..1aa56efc3 100644 --- a/transforms/language/pii_redactor/python/src/pii_redactor_transform.py +++ b/transforms/language/pii_redactor/python/src/pii_redactor_transform.py @@ -44,7 +44,14 @@ The default score_threshold value will be 0.6. 
At this range, false positives are reduced significantly. """ -default_supported_entities = ["PERSON", "EMAIL_ADDRESS", "ORGANIZATION", "DATE_TIME", "CREDIT_CARD", "PHONE_NUMBER"] +default_supported_entities = [ + "PERSON", + "EMAIL_ADDRESS", + "ORGANIZATION", + "DATE_TIME", + "CREDIT_CARD", + "PHONE_NUMBER", +] """By default it supports person name, email address, organization, date, credit card, phone number. To know more about entities, refer to https://microsoft.github.io/presidio/supported_entities/ """ @@ -68,7 +75,8 @@ def __init__(self, config: dict): score_threshold_value = config.get(score_threshold_key, default_score_threshold_key) self.analyzer = PIIAnalyzerEngine( - supported_entities=self.supported_entities, score_threshold=score_threshold_value + supported_entities=self.supported_entities, + score_threshold=score_threshold_value, ) self.anonymizer = PIIAnonymizer(operator=self.redaction_operator.lower()) @@ -92,7 +100,10 @@ def transform(self, table: pa.Table, file_name: Optional[str] = None) -> tuple[l """ TransformUtils.validate_columns(table=table, required=[pii_contents_column]) - metadata = {"original_table_rows": table.num_rows, "original_column_count": len(table.column_names)} + metadata = { + "original_table_rows": table.num_rows, + "original_column_count": len(table.column_names), + } redacted_texts, entity_types_list = zip(*table[pii_contents_column].to_pandas().apply(self._redact_pii)) table = table.add_column(0, self.doc_contents_key, [redacted_texts]) diff --git a/transforms/language/pii_redactor/python/test/test_data.py b/transforms/language/pii_redactor/python/test/test_data.py index dcee489a9..fbf4941e6 100644 --- a/transforms/language/pii_redactor/python/test/test_data.py +++ b/transforms/language/pii_redactor/python/test/test_data.py @@ -42,11 +42,20 @@ 0, "transformed_contents", [ - ["My name is . Captain of the ship", "I work at and I like to eat apples", "My email is and dob is "], + [ + "My name is . Captain of the ship", + "I work at and I like to eat apples", + "My email is and dob is ", + ], ], ) redacted_expected_table = redacted_expected_table.add_column(0, "detected_pii", [detected_pii]) expected_metadata_list = [ - {"original_table_rows": 3, "original_column_count": 2, "transformed_table_rows": 3, "transformed_column_count": 4}, + { + "original_table_rows": 3, + "original_column_count": 2, + "transformed_table_rows": 3, + "transformed_column_count": 4, + }, {}, ] diff --git a/transforms/language/pii_redactor/python/test/test_pii_analyzer.py b/transforms/language/pii_redactor/python/test/test_pii_analyzer.py index 75d237218..1ab544772 100644 --- a/transforms/language/pii_redactor/python/test/test_pii_analyzer.py +++ b/transforms/language/pii_redactor/python/test/test_pii_analyzer.py @@ -18,7 +18,15 @@ def analyzer(): """ Fixture to initialize PIIAnalyzerEngine once per module.
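The transform hunk above fans one row function out into two new columns: it applies _redact_pii over the contents column, unzips the (redacted_text, entity_types) pairs, and prepends both columns to the table. A sketch with the redactor stubbed out (the stub's behavior is an assumption):

```python
# Two-output annotation sketch following PIIRedactorTransform.transform.
import pyarrow as pa

def _redact_pii(text: str) -> tuple[str, list[str]]:
    # hypothetical stand-in for the analyzer + anonymizer pair
    return text.replace("John", ""), (["PERSON"] if "John" in text else [])

table = pa.table({"title": ["My name is John", "no PII here"]})

redacted_texts, entity_types_list = zip(*table["title"].to_pandas().apply(_redact_pii))
table = table.add_column(0, "new_contents", [list(redacted_texts)])
table = table.add_column(0, "detected_pii", [list(entity_types_list)])
print(table.to_pydict())
```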
""" - supported_entities = ["PERSON", "EMAIL_ADDRESS", "DATE_TIME", "URL", "CREDIT_CARD", "PHONE_NUMBER", "LOCATION"] + supported_entities = [ + "PERSON", + "EMAIL_ADDRESS", + "DATE_TIME", + "URL", + "CREDIT_CARD", + "PHONE_NUMBER", + "LOCATION", + ] score_threshold = 0.6 return PIIAnalyzerEngine(supported_entities=supported_entities, score_threshold=score_threshold) diff --git a/transforms/language/pii_redactor/python/test/test_pii_redactor_redact_anonamize.py b/transforms/language/pii_redactor/python/test/test_pii_redactor_redact_anonamize.py index 71d3bfb25..bb3011340 100644 --- a/transforms/language/pii_redactor/python/test/test_pii_redactor_redact_anonamize.py +++ b/transforms/language/pii_redactor/python/test/test_pii_redactor_redact_anonamize.py @@ -30,7 +30,10 @@ def get_test_transform_fixtures(self) -> list[tuple]: fixtures = [ ( PIIRedactorTransform( - {doc_transformed_contents_key: doc_transformed_contents_key, redaction_operator_key: "redact"} + { + doc_transformed_contents_key: doc_transformed_contents_key, + redaction_operator_key: "redact", + } ), [table], [redacted_expected_table], diff --git a/transforms/language/pii_redactor/ray/test/test_pii_redactor_ray.py b/transforms/language/pii_redactor/ray/test/test_pii_redactor_ray.py index 360ed4286..1aee9427c 100644 --- a/transforms/language/pii_redactor/ray/test/test_pii_redactor_ray.py +++ b/transforms/language/pii_redactor/ray/test/test_pii_redactor_ray.py @@ -34,7 +34,10 @@ def get_test_transform_fixtures(self) -> list[tuple]: fixtures.append( ( launcher, - {doc_transformed_contents_cli_param: "new_contents", "run_locally": True}, + { + doc_transformed_contents_cli_param: "new_contents", + "run_locally": True, + }, basedir + "/input", basedir + "/expected", ) diff --git a/transforms/language/similarity/dpk_similarity/local.py b/transforms/language/similarity/dpk_similarity/local.py index 9bd434ef9..2b6e2f603 100644 --- a/transforms/language/similarity/dpk_similarity/local.py +++ b/transforms/language/similarity/dpk_similarity/local.py @@ -28,13 +28,13 @@ "es_endpoint": None, "es_index": "mydata", "annotation_column": "similarity_score", - "doc_text_column": "contents" + "doc_text_column": "contents", } if __name__ == "__main__": data_access = DataAccessLocal(local_conf) similarity_params["data_access"] = data_access - + # Use the local data access to read a parquet table. table, _ = data_access.get_table(os.path.join(input_folder, "test1.parquet")) print(f"input table: {table}") diff --git a/transforms/language/similarity/dpk_similarity/local_python.py b/transforms/language/similarity/dpk_similarity/local_python.py index 217fc8c33..cb445b5a7 100644 --- a/transforms/language/similarity/dpk_similarity/local_python.py +++ b/transforms/language/similarity/dpk_similarity/local_python.py @@ -38,7 +38,7 @@ "similarity_es_endpoint": None, "similarity_es_index": "mydata", "similarity_annotation_column": "similarity_score", - "similarity_doc_text_column": "contents" + "similarity_doc_text_column": "contents", } diff --git a/transforms/language/similarity/dpk_similarity/transform.py b/transforms/language/similarity/dpk_similarity/transform.py index dd1c2b1f9..8468896cf 100644 --- a/transforms/language/similarity/dpk_similarity/transform.py +++ b/transforms/language/similarity/dpk_similarity/transform.py @@ -10,18 +10,18 @@ # limitations under the License. 
################################################################################ -import time -from argparse import ArgumentParser, Namespace import json import os +import time +from argparse import ArgumentParser, Namespace from typing import Any import pyarrow as pa +import requests from data_processing.transform import AbstractTableTransform, TransformConfiguration from data_processing.utils import CLIArgumentProvider, TransformUtils +from requests.auth import HTTPBasicAuth -import requests -from requests.auth import HTTPBasicAuth short_name = "similarity" cli_prefix = f"{short_name}_" @@ -102,7 +102,7 @@ "required": False, "help": "Shingle size for query construction (default is 8)", }, - { + { "key": RESULT_SIZE_KEY, "cli_param": RESULT_SIZE_CLI_PARAM, "default": RESULT_SIZE_DEFAULT, @@ -125,11 +125,10 @@ "type": str, "required": False, "help": "The column name that contains the document text", - } + }, ] - class SimilarityTransform(AbstractTableTransform): """ Implements a simple copy of a pyarrow Table. @@ -154,26 +153,38 @@ def __init__(self, config: dict[str, Any]): self.annotation_column = config.get(ANNOTATION_COLUMN_KEY, ANNOTATION_COLUMN_DEFAULT) self.doc_text_column = config.get(DOC_TEXT_COLUMN_KEY, DOC_TEXT_COLUMN_DEFAULT) - - def _testElasticFuncioning(self): - url=self.es_endpoint - headers = {'Content-Type': 'application/json'} - res = requests.get(url=url, headers=headers, auth = HTTPBasicAuth(self.es_userid, self.es_pwd), verify=False) + def _testElasticFuncioning(self): + url = self.es_endpoint + headers = {"Content-Type": "application/json"} + res = requests.get( + url=url, + headers=headers, + auth=HTTPBasicAuth(self.es_userid, self.es_pwd), + verify=False, + ) if res.status_code != 200: print(f"ERROR: {res.text}") return False return True - def _excecuteQuery(self, query): if self._testElasticFuncioning(): - r = requests.post(url=f"{self.es_endpoint.rstrip('/')}/_search", json=query, auth = HTTPBasicAuth(self.es_userid, self.es_pwd), verify=False) + r = requests.post( + url=f"{self.es_endpoint.rstrip('/')}/_search", + json=query, + auth=HTTPBasicAuth(self.es_userid, self.es_pwd), + verify=False, + ) q = r.json() res = [] # try: for d in q["hits"]["hits"]: - ddd = {"id":d["fields"]["_id"][0],"index":d["fields"]["_index"][0], "score" :d["_score"]} + ddd = { + "id": d["fields"]["_id"][0], + "index": d["fields"]["_index"][0], + "score": d["_score"], + } ddd["contents"] = d["highlight"]["contents"] res.append(ddd) # except Exception as ex: @@ -181,7 +192,6 @@ def _excecuteQuery(self, query): return res return None - def _getNgramQuery(self, text): slop = 2 context = self.shingle_size @@ -189,13 +199,13 @@ def _getNgramQuery(self, text): # generate all possible shingles s = [] - spaces = [i for i,j in enumerate(document) if j==' '] - spaces.insert(0,0) + spaces = [i for i, j in enumerate(document) if j == " "] + spaces.insert(0, 0) end = len(spaces) - context if end > 0: for c in range(0, end): - s.append( document[spaces[c]:spaces[c+context]].strip()) - s.append( document[spaces[end]:len (document)].strip()) + s.append(document[spaces[c] : spaces[c + context]].strip()) + s.append(document[spaces[end] : len(document)].strip()) else: s.append(document.strip()) @@ -203,55 +213,32 @@ def _getNgramQuery(self, text): shingles = [] for ss in s: - shingles.append ( - { - "match_phrase": { - "contents": { - "query": ss, - "slop": slop - } - } - }) + shingles.append({"match_phrase": {"contents": {"query": ss, "slop": slop}}}) # construct elastic query - query = { - "size": 
self.result_size, - # "_source": "false", - "fields":["_id", "_index", "_score"], - "query": { - "bool": { - "filter": { - "exists": { - "field": "contents" + query = { + "size": self.result_size, + # "_source": "false", + "fields": ["_id", "_index", "_score"], + "query": { + "bool": { + "filter": {"exists": {"field": "contents"}}, + "must": [{"bool": {"should": shingles, "minimum_should_match": 1}}], + } + }, + "highlight": { + "pre_tags": [""], + "post_tags": [""], + "fields": { + "contents": { + "order": "score", + "fragment_size": 0, + "number_of_fragments": 1, } }, - "must": [ - { - "bool": { - "should": shingles, - "minimum_should_match": 1 - } - } - ] - } - }, - "highlight": { - "pre_tags": [ - "" - ], - "post_tags": [ - "" - ], - "fields": { - "contents": { - "order": "score", - "fragment_size": 0, - "number_of_fragments": 1} - } + }, } - } return query - def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]: """ @@ -276,15 +263,13 @@ def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Tab with open(json_file_path, "r") as f: result_list = json.load(f) + assert len(self.df) == len( + result_list + ), "The number of rows in the dataframe does not match the number of elements in result_list." - assert len(self.df) == len(result_list), "The number of rows in the dataframe does not match the number of elements in result_list." - - - - self.df[self.annotation_column] = result_list print(self.df) - + table = pa.Table.from_pandas(self.df) self.logger.debug(f"Transformed one table with {len(table)} rows") @@ -293,7 +278,6 @@ def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Tab class SimilarityTransformConfiguration(TransformConfiguration): - """ Provides support for configuring and using the associated Transform class include configuration with CLI args. @@ -309,7 +293,6 @@ def __init__(self): self.logger = get_logger(__name__) - def add_input_params(self, parser: ArgumentParser) -> None: """ Add Transform-specific arguments to the given parser. @@ -323,10 +306,9 @@ def add_input_params(self, parser: ArgumentParser) -> None: type=param["type"], required=param["required"], default=param["default"], - help=param["help"] + help=param["help"], ) - def apply_input_params(self, args: Namespace) -> bool: """ Validate and apply the arguments that have been parsed @@ -340,17 +322,23 @@ def apply_input_params(self, args: Namespace) -> bool: try: shingle_size_int = int(shingle_size_value) if shingle_size_int <= 0: - print(f"Parameter '--{SHINGLE_SIZE_CLI_PARAM}' should be greater than 0. You specified '{shingle_size_value}'.") + print( + f"Parameter '--{SHINGLE_SIZE_CLI_PARAM}' should be greater than 0. You specified '{shingle_size_value}'." + ) return False except (ValueError, TypeError): - print(f"Parameter '--{SHINGLE_SIZE_CLI_PARAM}' should be an integer greater than 0. You specified '{shingle_size_value}'.") + print( + f"Parameter '--{SHINGLE_SIZE_CLI_PARAM}' should be an integer greater than 0. You specified '{shingle_size_value}'." 
+ ) return False # Set endpoint to None if the input is empty or "None" (as a string) - if captured.get('es_endpoint') == "None" or (isinstance(captured.get('es_endpoint'), str) and len(captured.get('es_endpoint')) == 0): - captured['es_endpoint'] = None + if captured.get("es_endpoint") == "None" or ( + isinstance(captured.get("es_endpoint"), str) and len(captured.get("es_endpoint")) == 0 + ): + captured["es_endpoint"] = None self.params = self.params | captured - params_to_print = {k:v for k,v in self.params.items() if k != 'es_pwd'} + params_to_print = {k: v for k, v in self.params.items() if k != "es_pwd"} self.logger.info(f"{short_name} parameters are : {params_to_print}") return True diff --git a/transforms/language/similarity/dpk_similarity/transform_python.py b/transforms/language/similarity/dpk_similarity/transform_python.py index e9f076dc8..e98d71596 100644 --- a/transforms/language/similarity/dpk_similarity/transform_python.py +++ b/transforms/language/similarity/dpk_similarity/transform_python.py @@ -16,6 +16,8 @@ ) from data_processing.utils import get_logger from dpk_similarity.transform import SimilarityTransformConfiguration + + logger = get_logger(__name__) diff --git a/transforms/language/similarity/test/test_similarity.py b/transforms/language/similarity/test/test_similarity.py index 4720a1be2..b199a40ef 100644 --- a/transforms/language/similarity/test/test_similarity.py +++ b/transforms/language/similarity/test/test_similarity.py @@ -16,7 +16,8 @@ from data_processing.test_support.transform.table_transform_test import ( AbstractTableTransformTest, ) -from dpk_similarity.transform import SimilarityTransform, ES_ENDPOINT_KEY +from dpk_similarity.transform import ES_ENDPOINT_KEY, SimilarityTransform + # table = pa.Table.from_pydict({"name": pa.array(["Tom"]), "age": pa.array([23])}) # expected_table = table @@ -38,6 +39,11 @@ def get_test_transform_fixtures(self) -> list[tuple]: expected_metadata_list = [{"nrows": 8}, {}] config = {ES_ENDPOINT_KEY: None} fixtures = [ - (SimilarityTransform(config), input_tables, expected_tables, expected_metadata_list), + ( + SimilarityTransform(config), + input_tables, + expected_tables, + expected_metadata_list, + ), ] return fixtures diff --git a/transforms/language/similarity/test/test_similarity_python.py b/transforms/language/similarity/test/test_similarity_python.py index d90ec9380..5406077fa 100644 --- a/transforms/language/similarity/test/test_similarity_python.py +++ b/transforms/language/similarity/test/test_similarity_python.py @@ -16,10 +16,10 @@ from data_processing.test_support.launch.transform_test import ( AbstractTransformLauncherTest, ) - from dpk_similarity.transform import ES_ENDPOINT_CLI_PARAM from dpk_similarity.transform_python import SimilarityPythonTransformConfiguration + class TestPythonSimilarityTransform(AbstractTransformLauncherTest): """ Extends the super-class to define the test data for the tests defined there. 
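The `_getNgramQuery` hunks above are mechanical black reformatting, but the sliding-window shingling they touch is easier to verify outside the diff. Below is a minimal standalone sketch of the same windowing logic, assuming single-space-delimited text; the function name and driver are mine, not the repo's:

```python
# Standalone sketch of the shingling performed by _getNgramQuery;
# `context` plays the role of self.shingle_size.
def word_shingles(document: str, context: int) -> list[str]:
    spaces = [i for i, ch in enumerate(document) if ch == " "]
    spaces.insert(0, 0)
    end = len(spaces) - context
    shingles = []
    if end > 0:
        # one window per starting word, ending with the final full window
        for c in range(end):
            shingles.append(document[spaces[c] : spaces[c + context]].strip())
        shingles.append(document[spaces[end] :].strip())
    else:
        # document shorter than one window: a single shingle
        shingles.append(document.strip())
    return shingles


print(word_shingles("one two three four five", 3))
# -> ['one two three', 'two three four', 'three four five']
```

Each shingle then becomes one `match_phrase` clause with `slop: 2`, and the clauses are OR-ed under `bool.should` with `minimum_should_match: 1`, so a document matches the query when at least one window matches approximately.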
diff --git a/transforms/language/similarity/tools/load_elasticsearch/set_up_elasticsearch.py b/transforms/language/similarity/tools/load_elasticsearch/set_up_elasticsearch.py index 4a7f936c0..8211cb937 100644 --- a/transforms/language/similarity/tools/load_elasticsearch/set_up_elasticsearch.py +++ b/transforms/language/similarity/tools/load_elasticsearch/set_up_elasticsearch.py @@ -11,24 +11,26 @@ ################################################################################ import csv -import requests -from requests.auth import HTTPBasicAuth import json import os import sys -from sentence_transformers import SentenceTransformer + +import requests from dotenv import load_dotenv +from requests.auth import HTTPBasicAuth +from sentence_transformers import SentenceTransformer + # Load environment variables from .env if present load_dotenv() # Optional values for these may come from the .env file -elastic_host = os.getenv('ELASTIC_HOST', '') -elastic_user = os.getenv('ELASTIC_USER', 'elastic') -elastic_password = os.getenv('ELASTIC_PASSWORD', 'elastic') -index_name = os.getenv('INDEX_NAME', 'mydata') -csv_path = os.getenv('CSV_PATH', os.path.join(os.path.dirname(os.path.realpath(__file__)), "mydata.csv")) -CONTENTS_COLUMN = os.getenv('contents', 'contents') +elastic_host = os.getenv("ELASTIC_HOST", "") +elastic_user = os.getenv("ELASTIC_USER", "elastic") +elastic_password = os.getenv("ELASTIC_PASSWORD", "elastic") +index_name = os.getenv("INDEX_NAME", "mydata") +csv_path = os.getenv("CSV_PATH", os.path.join(os.path.dirname(os.path.realpath(__file__)), "mydata.csv")) +CONTENTS_COLUMN = os.getenv("contents", "contents") CREATE_INDEX = False # If True, attempt to create index if not exists BATCH_SIZE = 1000 @@ -37,15 +39,15 @@ print("Error: Missing required configuration. Ensure elastic_host, index_name, and csv_path are set.") sys.exit(1) -bulk_url = f'{elastic_host}/_bulk' -headers = { 'Content-Type': 'application/x-ndjson' } +bulk_url = f"{elastic_host}/_bulk" +headers = {"Content-Type": "application/x-ndjson"} elastic_auth = HTTPBasicAuth(elastic_user, elastic_password) mapping_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "mapping.json") with open(mapping_file_path, "r") as f: mappings = json.load(f) -model = SentenceTransformer('paraphrase-MiniLM-L6-v2') +model = SentenceTransformer("paraphrase-MiniLM-L6-v2") csv.field_size_limit(sys.maxsize) @@ -57,7 +59,13 @@ def index_exists(index): def create_index(index, mapping): """Create the Elasticsearch index with given mapping.""" - r = requests.put(f"{elastic_host}/{index}", auth=elastic_auth, headers={'Content-Type': 'application/json'}, json=mapping, verify=False) + r = requests.put( + f"{elastic_host}/{index}", + auth=elastic_auth, + headers={"Content-Type": "application/json"}, + json=mapping, + verify=False, + ) if r.status_code not in [200, 201]: print(f"Error creating index {index}: {r.status_code} {r.text}") sys.exit(1) @@ -81,7 +89,7 @@ def verify_and_setup_index(index): def read_csv(csv_file_path): """Read CSV and yield rows. Verify CONTENTS_COLUMN column.""" - with open(csv_file_path, "r", encoding='utf-8') as csvfile: + with open(csv_file_path, "r", encoding="utf-8") as csvfile: reader = csv.DictReader(csvfile) if CONTENTS_COLUMN not in reader.fieldnames: print(f"Error: '{CONTENTS_COLUMN}' column not found in CSV. 
Exiting.") @@ -102,7 +110,7 @@ def generate_embeddings_and_push(index, rows, batch_size): action = {"index": {"_index": index}} doc = { "contents": contents, - "sentence_transformers_384": sentence_transformers_384 + "sentence_transformers_384": sentence_transformers_384, } buffer.append(json.dumps(action)) @@ -129,11 +137,11 @@ def push_batch(batch): print(f"Error pushing batch: {r.status_code}\n{r.text}") sys.exit(1) response_json = r.json() - if response_json.get('errors', False): + if response_json.get("errors", False): print("Bulk indexing encountered errors:") - for item in response_json.get('items', []): - if item.get('index', {}).get('error'): - print(item.get('index', {}).get('error')) + for item in response_json.get("items", []): + if item.get("index", {}).get("error"): + print(item.get("index", {}).get("error")) sys.exit(1) @@ -151,4 +159,4 @@ def push_batch(batch): action_desc = "created" if CREATE_INDEX else "verified" print(f"Index '{index_name}' {action_desc}.") print(f"Indexed {total_docs_indexed} documents into '{index_name}'.") - print("Operation complete.") \ No newline at end of file + print("Operation complete.") diff --git a/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py b/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py index 13fc9eef6..a7ad529f1 100644 --- a/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py +++ b/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py @@ -28,6 +28,7 @@ # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" + # compute execution parameters. Here different transforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. 
def compute_exec_params_func( @@ -102,7 +103,14 @@ def text_encoder( ray_name: str = "text-encoder-kfp-ray", # name of Ray cluster # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, - ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, + ray_worker_options: dict = { + "replicas": 2, + "max_replicas": 2, + "min_replicas": 2, + "cpu": 2, + "memory": 4, + "image": task_image, + }, server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", # data access data_s3_config: str = "[{'input_folder': 'test/text_encoder/input/', 'output_folder': 'test/text_encoder/output/'}]", @@ -110,9 +118,13 @@ def text_encoder( data_max_files: int = -1, data_num_samples: int = -1, # orchestrator - runtime_actor_options: dict = {'num_cpus': 0.8}, + runtime_actor_options: dict = {"num_cpus": 0.8}, runtime_pipeline_id: str = "pipeline_id", - runtime_code_location: dict = {'github': 'github', 'commit_hash': '12345', 'path': 'path'}, + runtime_code_location: dict = { + "github": "github", + "commit_hash": "12345", + "path": "path", + }, # text_encoder parameters text_encoder_model_name: str = "BAAI/bge-small-en-v1.5", text_encoder_content_column_name: str = "contents", @@ -159,7 +171,12 @@ def text_encoder( :return: None """ # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params) + clean_up_task = cleanup_ray_op( + ray_name=ray_name, + run_id=run_id, + server_url=server_url, + additional_params=additional_params, + ) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) # pipeline definition with dsl.ExitHandler(clean_up_task): diff --git a/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py b/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py index a95343874..91bbbf349 100644 --- a/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py +++ b/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py @@ -28,6 +28,7 @@ # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" + # compute execution parameters. Here different transforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. 
def compute_exec_params_func( @@ -103,7 +104,14 @@ def text_encoder( ray_name: str = "text-encoder-kfp-ray", # name of Ray cluster # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, - ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, + ray_worker_options: dict = { + "replicas": 2, + "max_replicas": 2, + "min_replicas": 2, + "cpu": 2, + "memory": 4, + "image": task_image, + }, server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", # data access data_s3_config: str = "{'input_folder': 'test/text_encoder/input/', 'output_folder': 'test/text_encoder/output/'}", @@ -111,9 +119,13 @@ def text_encoder( data_max_files: int = -1, data_num_samples: int = -1, # orchestrator - runtime_actor_options: dict = {'num_cpus': 0.8}, + runtime_actor_options: dict = {"num_cpus": 0.8}, runtime_pipeline_id: str = "pipeline_id", - runtime_code_location: dict = {'github': 'github', 'commit_hash': '12345', 'path': 'path'}, + runtime_code_location: dict = { + "github": "github", + "commit_hash": "12345", + "path": "path", + }, # text_encoder parameters text_encoder_model_name: str = "BAAI/bge-small-en-v1.5", text_encoder_content_column_name: str = "contents", @@ -160,7 +172,12 @@ def text_encoder( :return: None """ # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params) + clean_up_task = cleanup_ray_op( + ray_name=ray_name, + run_id=run_id, + server_url=server_url, + additional_params=additional_params, + ) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) # pipeline definition with dsl.ExitHandler(clean_up_task): diff --git a/transforms/universal/doc_id/dpk_doc_id/ray/transform.py b/transforms/universal/doc_id/dpk_doc_id/ray/transform.py index 4c48ddeb2..934446a50 100644 --- a/transforms/universal/doc_id/dpk_doc_id/ray/transform.py +++ b/transforms/universal/doc_id/dpk_doc_id/ray/transform.py @@ -73,7 +73,10 @@ def __init__(self, params: dict[str, Any]): self.id_generator = None def get_transform_config( - self, data_access_factory: DataAccessFactoryBase, statistics: ActorHandle, files: list[str] + self, + data_access_factory: DataAccessFactoryBase, + statistics: ActorHandle, + files: list[str], ) -> dict[str, Any]: """ Set environment for filter execution @@ -104,7 +107,10 @@ def __init__(self): class DocIDRayTransformRuntimeConfiguration(RayTransformRuntimeConfiguration): def __init__(self): - super().__init__(transform_config=DocIDRayTransformConfiguration(), runtime_class=DocIDRayRuntime) + super().__init__( + transform_config=DocIDRayTransformConfiguration(), + runtime_class=DocIDRayRuntime, + ) # Class used by the notebooks to ingest binary files and create parquet files diff --git a/transforms/universal/doc_id/dpk_doc_id/spark/transform.py b/transforms/universal/doc_id/dpk_doc_id/spark/transform.py index 4af3429b3..d13e97247 100644 --- a/transforms/universal/doc_id/dpk_doc_id/spark/transform.py +++ b/transforms/universal/doc_id/dpk_doc_id/spark/transform.py @@ -88,7 +88,6 @@ def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Tab class DocIDTransformConfiguration(TransformConfiguration): - """ Provides support for configuring and using the associated Transform class include configuration with CLI args and combining of metadata. 
@@ -111,7 +110,10 @@ def add_input_params(self, parser: ArgumentParser) -> None: (e.g, noop_, pii_, etc.) """ parser.add_argument( - f"--{doc_column_name_cli_param}", type=str, default=doc_column_name_default, help="doc column name" + f"--{doc_column_name_cli_param}", + type=str, + default=doc_column_name_default, + help="doc column name", ) parser.add_argument( f"--{hash_column_name_cli_param}", @@ -151,7 +153,10 @@ def __init__(self, params: dict[str, Any]): super().__init__(params) def get_transform_config( - self, partition: int, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics + self, + partition: int, + data_access_factory: DataAccessFactoryBase, + statistics: TransformStatistics, ) -> dict[str, Any]: """ Get the dictionary of configuration that will be provided to the transform's initializer. @@ -177,7 +182,10 @@ def __init__(self): """ Initialization """ - super().__init__(transform_config=DocIDTransformConfiguration(), runtime_class=DocIDSparkTransformRuntime) + super().__init__( + transform_config=DocIDTransformConfiguration(), + runtime_class=DocIDSparkTransformRuntime, + ) if __name__ == "__main__": diff --git a/transforms/universal/doc_id/dpk_doc_id/transform.py b/transforms/universal/doc_id/dpk_doc_id/transform.py index 08d316f48..c26117726 100644 --- a/transforms/universal/doc_id/dpk_doc_id/transform.py +++ b/transforms/universal/doc_id/dpk_doc_id/transform.py @@ -118,7 +118,6 @@ def _get_starting_id(self, n_rows: int) -> int: class DocIDTransformConfigurationBase(TransformConfiguration): - """ Provides support for configuring and using the associated Transform class include configuration with CLI args and combining of metadata. @@ -141,7 +140,10 @@ def add_input_params(self, parser: ArgumentParser) -> None: (e.g, noop_, pii_, etc.) """ parser.add_argument( - f"--{doc_column_name_cli_param}", type=str, default=doc_column_name_default, help="doc column name" + f"--{doc_column_name_cli_param}", + type=str, + default=doc_column_name_default, + help="doc column name", ) parser.add_argument( f"--{hash_column_name_cli_param}", diff --git a/transforms/universal/doc_id/dpk_doc_id/transform_python.py b/transforms/universal/doc_id/dpk_doc_id/transform_python.py index f97ace554..38c86f931 100644 --- a/transforms/universal/doc_id/dpk_doc_id/transform_python.py +++ b/transforms/universal/doc_id/dpk_doc_id/transform_python.py @@ -81,7 +81,10 @@ def __init__(self, params: dict[str, Any]): self.id_generator = None def get_transform_config( - self, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics, files: list[str] + self, + data_access_factory: DataAccessFactoryBase, + statistics: TransformStatistics, + files: list[str], ) -> dict[str, Any]: """ Get the dictionary of configuration that will be provided to the transform's initializer. diff --git a/transforms/universal/doc_id/kfp_ray/doc_id_wf.py b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py index 6cff86b85..073e9339f 100644 --- a/transforms/universal/doc_id/kfp_ray/doc_id_wf.py +++ b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py @@ -27,6 +27,7 @@ # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" + # compute execution parameters. Here different transforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. 
def compute_exec_params_func( @@ -129,7 +130,11 @@ def doc_id( # orchestrator runtime_actor_options: dict = {"num_cpus": 0.8}, runtime_pipeline_id: str = "pipeline_id", - runtime_code_location: dict = {"github": "github", "commit_hash": "12345", "path": "path"}, + runtime_code_location: dict = { + "github": "github", + "commit_hash": "12345", + "path": "path", + }, # doc id parameters doc_id_doc_column: str = "contents", doc_id_hash_column: str = "hash_column", @@ -179,7 +184,10 @@ def doc_id( """ # create clean_up task clean_up_task = cleanup_ray_op( - ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params + ray_name=ray_name, + run_id=run_id, + server_url=server_url, + additional_params=additional_params, ) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) # pipeline definition diff --git a/transforms/universal/doc_id/test/test_doc_id_spark.py b/transforms/universal/doc_id/test/test_doc_id_spark.py index 3e3fd2cc7..8ce08e9bb 100644 --- a/transforms/universal/doc_id/test/test_doc_id_spark.py +++ b/transforms/universal/doc_id/test/test_doc_id_spark.py @@ -41,5 +41,12 @@ def get_test_transform_fixtures(self) -> list[tuple]: int_column_name_cli_param: "int_id_column", } - fixtures.append((launcher, transform_config, basedir + "/input", basedir + "/expected-spark")) + fixtures.append( + ( + launcher, + transform_config, + basedir + "/input", + basedir + "/expected-spark", + ) + ) return fixtures diff --git a/transforms/universal/ededup/dpk_ededup/local.py b/transforms/universal/ededup/dpk_ededup/local.py index f56ba2a1a..cb546c7c8 100644 --- a/transforms/universal/ededup/dpk_ededup/local.py +++ b/transforms/universal/ededup/dpk_ededup/local.py @@ -29,7 +29,11 @@ "output_folder": output_folder, } -ededup_params = {doc_column_name_key: "contents", int_column_name_key: "document_id", "filter": HashFilter({})} +ededup_params = { + doc_column_name_key: "contents", + int_column_name_key: "document_id", + "filter": HashFilter({}), +} if __name__ == "__main__": # Here we show how to run outside of ray diff --git a/transforms/universal/ededup/dpk_ededup/ray/transform.py b/transforms/universal/ededup/dpk_ededup/ray/transform.py index ab12034d7..b9f6046e0 100644 --- a/transforms/universal/ededup/dpk_ededup/ray/transform.py +++ b/transforms/universal/ededup/dpk_ededup/ray/transform.py @@ -109,7 +109,10 @@ def __init__(self, params: dict[str, Any]): self.logger = get_logger(__name__) def get_transform_config( - self, data_access_factory: DataAccessFactoryBase, statistics: ActorHandle, files: list[str] + self, + data_access_factory: DataAccessFactoryBase, + statistics: ActorHandle, + files: list[str], ) -> dict[str, Any]: """ Set environment for transform execution @@ -199,7 +202,11 @@ def compute_execution_stats(self, stats: dict[str, Any]) -> dict[str, Any]: # Wait for replies ready, not_ready = ray.wait(remote_replies) remote_replies = not_ready - return {"number of hashes": sum_hash, "hash memory, GB": sum_hash_mem, "de duplication %": dedup_prst} | stats + return { + "number of hashes": sum_hash, + "hash memory, GB": sum_hash_mem, + "de duplication %": dedup_prst, + } | stats class EdedupRayTransformConfiguration(EdedupTransformConfigurationBase): @@ -216,8 +223,18 @@ def add_input_params(self, parser: ArgumentParser) -> None: Add Transform-specific arguments to the given parser. 
""" super().add_input_params(parser) - parser.add_argument(f"--{hash_cpu_cli_params}", type=float, default=0.5, help="number of CPUs per hash") - parser.add_argument(f"--{num_hashes_cli_params}", type=int, default=0, help="number of hash actors to use") + parser.add_argument( + f"--{hash_cpu_cli_params}", + type=float, + default=0.5, + help="number of CPUs per hash", + ) + parser.add_argument( + f"--{num_hashes_cli_params}", + type=int, + default=0, + help="number of hash actors to use", + ) def apply_input_params(self, args: Namespace) -> bool: """ @@ -234,7 +251,10 @@ def apply_input_params(self, args: Namespace) -> bool: class EdedupRayTransformRuntimeConfiguration(RayTransformRuntimeConfiguration): def __init__(self): - super().__init__(transform_config=EdedupRayTransformConfiguration(), runtime_class=EdedupRayRuntime) + super().__init__( + transform_config=EdedupRayTransformConfiguration(), + runtime_class=EdedupRayRuntime, + ) # Class used by the notebooks to ingest binary files and create parquet files diff --git a/transforms/universal/ededup/dpk_ededup/transform_base.py b/transforms/universal/ededup/dpk_ededup/transform_base.py index 6243adeef..d256c6adc 100644 --- a/transforms/universal/ededup/dpk_ededup/transform_base.py +++ b/transforms/universal/ededup/dpk_ededup/transform_base.py @@ -107,7 +107,8 @@ def snapshot(self) -> None: b_doc = pickle.dumps(self.hashes) # Save it self.data_access.save_file( - f"{SnapshotUtils.get_snapshot_folder(self.data_access)}hash_collector_{self.actor_id}", b_doc + f"{SnapshotUtils.get_snapshot_folder(self.data_access)}hash_collector_{self.actor_id}", + b_doc, ) except Exception as e: self.logger.warning(f"Failed to snapshot doc collector {self.actor_id} with exception {e}") @@ -181,7 +182,10 @@ def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Tab removed_column[0] = removed out_table = TransformUtils.add_column(table=out_table, name="removed", content=removed_column) # report statistics - stats = {"source_documents": table.num_rows, "result_documents": out_table.num_rows} + stats = { + "source_documents": table.num_rows, + "result_documents": out_table.num_rows, + } return [out_table], stats def _process_cached_hashes(self, hd: dict[str, str]) -> list[str]: @@ -233,7 +237,10 @@ def add_input_params(self, parser: ArgumentParser) -> None: # by default, snapshot file is from the output directory. This parameter can overwrite # default location by explicitly defining the snapshot directory parser.add_argument( - f"--{snapshot_directory_cli_param}", type=str, default=None, help="location of snapshot files" + f"--{snapshot_directory_cli_param}", + type=str, + default=None, + help="location of snapshot files", ) def apply_input_params(self, args: Namespace) -> bool: diff --git a/transforms/universal/ededup/dpk_ededup/transform_python.py b/transforms/universal/ededup/dpk_ededup/transform_python.py index d412153ac..792f0caf9 100644 --- a/transforms/universal/ededup/dpk_ededup/transform_python.py +++ b/transforms/universal/ededup/dpk_ededup/transform_python.py @@ -70,7 +70,10 @@ def __init__(self, params: dict[str, Any]): self.logger = get_logger(__name__) def get_transform_config( - self, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics, files: list[str] + self, + data_access_factory: DataAccessFactoryBase, + statistics: TransformStatistics, + files: list[str], ) -> dict[str, Any]: """ Get the dictionary of configuration that will be provided to the transform's initializer. 
@@ -93,7 +96,13 @@ def get_transform_config( else: snapshot_path = f"{snapshot_path}/hash_collector_1" self.logger.info(f"continuing from the hash snapshot {snapshot_path}") - self.filter = HashFilter({"data_access_factory": data_access_factory, "id": 1, "snapshot": snapshot_path}) + self.filter = HashFilter( + { + "data_access_factory": data_access_factory, + "id": 1, + "snapshot": snapshot_path, + } + ) else: self.logger.info("Starting from the beginning") self.filter = HashFilter({"data_access_factory": data_access_factory, "id": 1}) diff --git a/transforms/universal/ededup/kfp_ray/ededup_wf.py b/transforms/universal/ededup/kfp_ray/ededup_wf.py index 29adf5c18..966b2b3d9 100644 --- a/transforms/universal/ededup/kfp_ray/ededup_wf.py +++ b/transforms/universal/ededup/kfp_ray/ededup_wf.py @@ -91,7 +91,11 @@ def ededup( # orchestrator runtime_actor_options: dict = {"num_cpus": 0.8}, runtime_pipeline_id: str = "pipeline_id", - runtime_code_location: dict = {"github": "github", "commit_hash": "12345", "path": "path"}, + runtime_code_location: dict = { + "github": "github", + "commit_hash": "12345", + "path": "path", + }, # ededup ededup_hash_cpu: float = 0.5, ededup_doc_column: str = "contents", @@ -144,7 +148,10 @@ def ededup( """ # create clean_up task clean_up_task = cleanup_ray_op( - ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params + ray_name=ray_name, + run_id=run_id, + server_url=server_url, + additional_params=additional_params, ) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) # pipeline definition diff --git a/transforms/universal/ededup/kfp_ray/src/ededup_compute_execution_params.py b/transforms/universal/ededup/kfp_ray/src/ededup_compute_execution_params.py index 6f8197877..076ec2dec 100644 --- a/transforms/universal/ededup/kfp_ray/src/ededup_compute_execution_params.py +++ b/transforms/universal/ededup/kfp_ray/src/ededup_compute_execution_params.py @@ -73,7 +73,13 @@ def ededup_compute_execution_params( # S3 config is list. 
take the first element s3_config = s3_config[0] # because S3 is the only viable version for kfp-based implementation, we are here creating DataAccess S3 directly - data_access = DataAccessS3(s3_credentials=s3_creds, s3_config=s3_config, d_sets=None, checkpoint=False, m_files=-1) + data_access = DataAccessS3( + s3_credentials=s3_creds, + s3_config=s3_config, + d_sets=None, + checkpoint=False, + m_files=-1, + ) # sample input data sampling, _ = data_access.sample_input_data(n_samples=ededup_n_samples) avg_doc_size = sampling.get("average doc size KB") diff --git a/transforms/universal/ededup/test/test_ededup.py b/transforms/universal/ededup/test/test_ededup.py index 6c9d85f23..94c1e3cca 100644 --- a/transforms/universal/ededup/test/test_ededup.py +++ b/transforms/universal/ededup/test/test_ededup.py @@ -34,8 +34,17 @@ def get_test_transform_fixtures(self) -> list[Tuple]: input_dir = os.path.join(basedir, "input") input_tables = get_tables_in_folder(input_dir) expected_metadata_list = [{"result_documents": 3, "source_documents": 5}, {}] - config = {doc_column_name_key: "contents", int_column_name_key: "document_id", "filter": HashFilter({})} + config = { + doc_column_name_key: "contents", + int_column_name_key: "document_id", + "filter": HashFilter({}), + } expected_tables = get_tables_in_folder(os.path.join(basedir, "expected")) return [ - (EdedupTransform(config), input_tables, expected_tables, expected_metadata_list), + ( + EdedupTransform(config), + input_tables, + expected_tables, + expected_metadata_list, + ), ] diff --git a/transforms/universal/ededup/test/test_ededup_python.py b/transforms/universal/ededup/test/test_ededup_python.py index d98a61ff3..6780eb4a3 100644 --- a/transforms/universal/ededup/test/test_ededup_python.py +++ b/transforms/universal/ededup/test/test_ededup_python.py @@ -34,5 +34,8 @@ def get_test_transform_fixtures(self) -> list[tuple]: fixtures = [] basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) launcher = PythonTransformLauncher(EdedupPythonTransformRuntimeConfiguration()) - config = {doc_column_name_cli_param: "contents", int_column_name_cli_param: "document_id"} + config = { + doc_column_name_cli_param: "contents", + int_column_name_cli_param: "document_id", + } return [(launcher, config, basedir + "/input", basedir + "/expected")] diff --git a/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/local_python.py b/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/local_python.py index 61302b74a..496090de5 100644 --- a/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/local_python.py +++ b/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/local_python.py @@ -13,16 +13,23 @@ import os import sys +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.utils import ParamsUtils from dpk_fdedup.cluster_analysis.transform_python import ( ClusterAnalysisPythonTransformConfiguration, ) -from data_processing.runtime.pure_python import PythonTransformLauncher -from data_processing.utils import ParamsUtils # create parameters input_folder = os.path.abspath( - os.path.join(os.path.dirname(__file__), "..", "test-data", "expected", "signature_calc", "bands") + os.path.join( + os.path.dirname(__file__), + "..", + "test-data", + "expected", + "signature_calc", + "bands", + ) ) output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "docs_to_remove")) local_conf = { diff --git a/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/ray/local.py 
b/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/ray/local.py index a4ec84741..d4d06a1b9 100644 --- a/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/ray/local.py +++ b/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/ray/local.py @@ -13,14 +13,23 @@ import os import sys -from dpk_fdedup.cluster_analysis.ray.transform import ClusterAnalysisRayTransformConfiguration from data_processing.utils import ParamsUtils from data_processing_ray.runtime.ray import RayTransformLauncher +from dpk_fdedup.cluster_analysis.ray.transform import ( + ClusterAnalysisRayTransformConfiguration, +) # create parameters input_folder = os.path.abspath( - os.path.join(os.path.dirname(__file__), "..", "test-data", "expected", "signature_calc", "bands") + os.path.join( + os.path.dirname(__file__), + "..", + "test-data", + "expected", + "signature_calc", + "bands", + ) ) output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "docs_to_remove")) local_conf = { diff --git a/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/ray/transform.py b/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/ray/transform.py index 10b850192..8567a4575 100644 --- a/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/ray/transform.py +++ b/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/ray/transform.py @@ -13,11 +13,6 @@ import os from typing import Any -from dpk_fdedup.cluster_analysis.transform import ( - ClusterAnalysisTransformConfiguration, - num_bands_key, - num_segments_key, -) from data_processing.data_access import DataAccess from data_processing.utils import CLIArgumentProvider, get_logger from data_processing_ray.runtime.ray import ( @@ -25,6 +20,11 @@ RayTransformLauncher, RayTransformRuntimeConfiguration, ) +from dpk_fdedup.cluster_analysis.transform import ( + ClusterAnalysisTransformConfiguration, + num_bands_key, + num_segments_key, +) logger = get_logger(__name__) diff --git a/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/spark/local.py b/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/spark/local.py index 408220b6b..0aacf0fd1 100644 --- a/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/spark/local.py +++ b/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/spark/local.py @@ -14,14 +14,23 @@ import sys import polars as pl -from dpk_fdedup.cluster_analysis.spark.transform import ClusterAnalysisSparkTransformConfiguration from data_processing.utils import ParamsUtils from data_processing_spark.runtime.spark import SparkTransformLauncher +from dpk_fdedup.cluster_analysis.spark.transform import ( + ClusterAnalysisSparkTransformConfiguration, +) # create parameters input_folder = os.path.abspath( - os.path.join(os.path.dirname(__file__), "..", "test-data", "expected", "signature_calc", "bands") + os.path.join( + os.path.dirname(__file__), + "..", + "test-data", + "expected", + "signature_calc", + "bands", + ) ) output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "docs_to_remove")) local_conf = { diff --git a/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/spark/transform.py b/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/spark/transform.py index 97ab7a48f..b301be199 100644 --- a/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/spark/transform.py +++ b/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/spark/transform.py @@ -13,11 +13,6 @@ import os from typing import Any -from dpk_fdedup.cluster_analysis.transform import ( - 
ClusterAnalysisTransformConfiguration, - num_bands_key, - num_segments_key, -) from data_processing.data_access import DataAccess from data_processing.utils import get_logger from data_processing_spark.runtime.spark import ( @@ -25,6 +20,11 @@ SparkTransformLauncher, SparkTransformRuntimeConfiguration, ) +from dpk_fdedup.cluster_analysis.transform import ( + ClusterAnalysisTransformConfiguration, + num_bands_key, + num_segments_key, +) logger = get_logger(__name__) diff --git a/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py b/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py index b414adaa6..56bbeaef5 100644 --- a/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py +++ b/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py @@ -196,7 +196,11 @@ def _get_clusters(self, band_segment_dataframe: pl.DataFrame) -> tuple[pl.DataFr def _analyze_clusters(self, df: pl.DataFrame) -> tuple[pl.DataFrame, dict[str, Any]]: # Define the schema with specific data types - schema = {"first_doc": pl.Int64, "docs_to_remove": pl.List(pl.Int64), "docs_to_remove_length": pl.Int64} + schema = { + "first_doc": pl.Int64, + "docs_to_remove": pl.List(pl.Int64), + "docs_to_remove_length": pl.Int64, + } doc_ids_lists = [] docs_to_remove_lists = [] len_of_docs2remove_lists = [] @@ -277,7 +281,6 @@ def _jaccard_distance_calculation(self, row: List[pl.Series]) -> list[list]: class ClusterAnalysisTransformConfiguration(TransformConfiguration): - """ Provides support for configuring and using the associated Transform class include configuration with CLI args. diff --git a/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform_python.py b/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform_python.py index e882ea6cc..ec5cae06e 100644 --- a/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform_python.py +++ b/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform_python.py @@ -14,11 +14,6 @@ import time from typing import Any -from dpk_fdedup.cluster_analysis.transform import ( - ClusterAnalysisTransformConfiguration, - num_bands_key, - num_segments_key, -) from data_processing.data_access import DataAccess from data_processing.runtime.pure_python import ( DefaultPythonTransformRuntime, @@ -26,6 +21,11 @@ PythonTransformRuntimeConfiguration, ) from data_processing.utils import get_logger +from dpk_fdedup.cluster_analysis.transform import ( + ClusterAnalysisTransformConfiguration, + num_bands_key, + num_segments_key, +) logger = get_logger(__name__) diff --git a/transforms/universal/fdedup/dpk_fdedup/data_cleaning/local_python.py b/transforms/universal/fdedup/dpk_fdedup/data_cleaning/local_python.py index d0976ec76..c853e80de 100644 --- a/transforms/universal/fdedup/dpk_fdedup/data_cleaning/local_python.py +++ b/transforms/universal/fdedup/dpk_fdedup/data_cleaning/local_python.py @@ -13,13 +13,15 @@ import os import sys +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.utils import ParamsUtils from dpk_fdedup.data_cleaning.transform import ( document_id_column_cli_param, duplicate_list_location_cli_param, ) -from dpk_fdedup.data_cleaning.transform_python import DataCleaningPythonTransformConfiguration -from data_processing.runtime.pure_python import PythonTransformLauncher -from data_processing.utils import ParamsUtils +from dpk_fdedup.data_cleaning.transform_python import ( + DataCleaningPythonTransformConfiguration, +) # create parameters diff --git 
a/transforms/universal/fdedup/dpk_fdedup/data_cleaning/ray/local.py b/transforms/universal/fdedup/dpk_fdedup/data_cleaning/ray/local.py index f72fc0902..8baa3946a 100644 --- a/transforms/universal/fdedup/dpk_fdedup/data_cleaning/ray/local.py +++ b/transforms/universal/fdedup/dpk_fdedup/data_cleaning/ray/local.py @@ -13,13 +13,16 @@ import os import sys +from data_processing.utils import ParamsUtils +from data_processing_ray.runtime.ray import RayTransformLauncher from dpk_fdedup.data_cleaning.transform import ( document_id_column_cli_param, duplicate_list_location_cli_param, ) -from transforms.universal.fdedup.dpk_fdedup.data_cleaning.ray.transform import DataCleaningRayTransformConfiguration -from data_processing.utils import ParamsUtils -from data_processing_ray.runtime.ray import RayTransformLauncher + +from transforms.universal.fdedup.dpk_fdedup.data_cleaning.ray.transform import ( + DataCleaningRayTransformConfiguration, +) # create parameters diff --git a/transforms/universal/fdedup/dpk_fdedup/data_cleaning/ray/transform.py b/transforms/universal/fdedup/dpk_fdedup/data_cleaning/ray/transform.py index 4a4bd52f0..67e12751c 100644 --- a/transforms/universal/fdedup/dpk_fdedup/data_cleaning/ray/transform.py +++ b/transforms/universal/fdedup/dpk_fdedup/data_cleaning/ray/transform.py @@ -14,14 +14,6 @@ from typing import Any import ray -from dpk_fdedup.data_cleaning.transform import ( - DataCleaningTransform, - DataCleaningTransformConfiguration, - dataclean_data_access_key, - dataclean_data_factory_key, - duplicate_list_location_default, - duplicate_list_location_key, -) from data_processing.data_access import DataAccessFactoryBase from data_processing.utils import CLIArgumentProvider, get_logger from data_processing_ray.runtime.ray import ( @@ -31,6 +23,14 @@ from data_processing_ray.runtime.ray.runtime_configuration import ( RayTransformRuntimeConfiguration, ) +from dpk_fdedup.data_cleaning.transform import ( + DataCleaningTransform, + DataCleaningTransformConfiguration, + dataclean_data_access_key, + dataclean_data_factory_key, + duplicate_list_location_default, + duplicate_list_location_key, +) from ray.actor import ActorHandle diff --git a/transforms/universal/fdedup/dpk_fdedup/data_cleaning/spark/local.py b/transforms/universal/fdedup/dpk_fdedup/data_cleaning/spark/local.py index 12c5ab244..5042894af 100644 --- a/transforms/universal/fdedup/dpk_fdedup/data_cleaning/spark/local.py +++ b/transforms/universal/fdedup/dpk_fdedup/data_cleaning/spark/local.py @@ -18,10 +18,13 @@ document_id_column_cli_param, duplicate_list_location_cli_param, ) -from transforms.universal.fdedup.dpk_fdedup.data_cleaning.spark.transform import DataCleaningSparkTransformConfiguration from data_processing.utils import ParamsUtils from data_processing_spark.runtime.spark import SparkTransformLauncher +from transforms.universal.fdedup.dpk_fdedup.data_cleaning.spark.transform import ( + DataCleaningSparkTransformConfiguration, +) + # create parameters input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) diff --git a/transforms/universal/fdedup/dpk_fdedup/data_cleaning/spark/transform.py b/transforms/universal/fdedup/dpk_fdedup/data_cleaning/spark/transform.py index 16d184a5e..ce18029ee 100644 --- a/transforms/universal/fdedup/dpk_fdedup/data_cleaning/spark/transform.py +++ b/transforms/universal/fdedup/dpk_fdedup/data_cleaning/spark/transform.py @@ -13,13 +13,6 @@ import os from typing import Any -from dpk_fdedup.data_cleaning.transform import ( - 
DataCleaningTransformConfiguration, - dataclean_data_access_key, - dataclean_data_factory_key, - duplicate_list_location_default, - duplicate_list_location_key, -) from data_processing.data_access import DataAccessFactoryBase from data_processing.transform import TransformStatistics from data_processing.utils import get_logger @@ -28,6 +21,13 @@ SparkTransformLauncher, SparkTransformRuntimeConfiguration, ) +from dpk_fdedup.data_cleaning.transform import ( + DataCleaningTransformConfiguration, + dataclean_data_access_key, + dataclean_data_factory_key, + duplicate_list_location_default, + duplicate_list_location_key, +) logger = get_logger(__name__) @@ -43,7 +43,10 @@ def __init__(self, params: dict[str, Any]): self.logger = get_logger(__name__) def get_transform_config( - self, partition: int, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics + self, + partition: int, + data_access_factory: DataAccessFactoryBase, + statistics: TransformStatistics, ) -> dict[str, Any]: """ Download the table of duplicate document ids that will be provided to the @@ -108,7 +111,11 @@ def get_bcast_params(self, data_access_factory: DataAccessFactoryBase) -> dict[s """ data_access = data_access_factory.create_data_access() duplicate_list_location = os.path.abspath( - os.path.join(data_access.output_folder, "..", self.transform_config.params["duplicate_list_location"]) + os.path.join( + data_access.output_folder, + "..", + self.transform_config.params["duplicate_list_location"], + ) ) if duplicate_list_location.startswith("s3://"): _, duplicate_list_location = duplicate_list_location.split("://") diff --git a/transforms/universal/fdedup/dpk_fdedup/data_cleaning/transform.py b/transforms/universal/fdedup/dpk_fdedup/data_cleaning/transform.py index cb07923ae..bbda6a314 100644 --- a/transforms/universal/fdedup/dpk_fdedup/data_cleaning/transform.py +++ b/transforms/universal/fdedup/dpk_fdedup/data_cleaning/transform.py @@ -124,7 +124,6 @@ def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Tab class DataCleaningTransformConfiguration(TransformConfiguration): - """ Provides support for configuring and using the associated Transform class include configuration with CLI args. 
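Nearly every fdedup hunk in this stretch is the same isort normalization: imports are sorted alphabetically (so the `data_processing*` framework packages land ahead of the first-party `dpk_fdedup*` modules), and any import that would exceed the line limit collapses into the parenthesized multi-line form. The layout these hunks converge on, illustrated with imports taken from the diffs above:

```python
# Standard-library imports first
import os
import sys

# Package imports next, alphabetized; parenthesized body once a
# line would exceed the length limit
from data_processing.runtime.pure_python import PythonTransformLauncher
from data_processing.utils import ParamsUtils
from dpk_fdedup.cluster_analysis.transform_python import (
    ClusterAnalysisPythonTransformConfiguration,
)
```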
diff --git a/transforms/universal/fdedup/dpk_fdedup/data_cleaning/transform_python.py b/transforms/universal/fdedup/dpk_fdedup/data_cleaning/transform_python.py index e29ef1218..d457ced4b 100644 --- a/transforms/universal/fdedup/dpk_fdedup/data_cleaning/transform_python.py +++ b/transforms/universal/fdedup/dpk_fdedup/data_cleaning/transform_python.py @@ -13,13 +13,6 @@ import os from typing import Any -from dpk_fdedup.data_cleaning.transform import ( - DataCleaningTransformConfiguration, - dataclean_data_access_key, - dataclean_data_factory_key, - duplicate_list_location_default, - duplicate_list_location_key, -) from data_processing.data_access import DataAccessFactoryBase from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing.runtime.pure_python.runtime_configuration import ( @@ -28,6 +21,13 @@ ) from data_processing.transform import TransformStatistics from data_processing.utils import get_logger +from dpk_fdedup.data_cleaning.transform import ( + DataCleaningTransformConfiguration, + dataclean_data_access_key, + dataclean_data_factory_key, + duplicate_list_location_default, + duplicate_list_location_key, +) logger = get_logger(__name__) @@ -43,7 +43,10 @@ def __init__(self, params: dict[str, Any]): self.logger = get_logger(__name__) def get_transform_config( - self, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics, files: list[str] + self, + data_access_factory: DataAccessFactoryBase, + statistics: TransformStatistics, + files: list[str], ) -> dict[str, Any]: """ Download the table of duplicate document ids that will be provided to the diff --git a/transforms/universal/fdedup/dpk_fdedup/get_duplicate_list/transform.py b/transforms/universal/fdedup/dpk_fdedup/get_duplicate_list/transform.py index c14c4bdce..59c2ee5bb 100644 --- a/transforms/universal/fdedup/dpk_fdedup/get_duplicate_list/transform.py +++ b/transforms/universal/fdedup/dpk_fdedup/get_duplicate_list/transform.py @@ -121,7 +121,6 @@ def _consolidate_docs_to_remove_files(self, files: dict[str, bytes]) -> tuple[pl class GetDuplicateListTransformConfiguration(TransformConfiguration): - """ Provides support for configuring and using the associated Transform class include configuration with CLI args. 
diff --git a/transforms/universal/fdedup/dpk_fdedup/ray/transform.py b/transforms/universal/fdedup/dpk_fdedup/ray/transform.py
index 76046ba4b..358667c63 100644
--- a/transforms/universal/fdedup/dpk_fdedup/ray/transform.py
+++ b/transforms/universal/fdedup/dpk_fdedup/ray/transform.py
@@ -14,20 +14,24 @@
 import os
 import sys
 
-from dpk_fdedup.cluster_analysis.ray.transform import ClusterAnalysisRayTransformConfiguration
-from dpk_fdedup.data_cleaning.ray.transform import DataCleaningRayTransformConfiguration
 from data_processing.runtime.pure_python import PythonTransformLauncher
 from data_processing.utils import ParamsUtils
 from data_processing_ray.runtime.ray import RayTransformLauncher
-from dpk_fdedup.transform_python import ServiceOrchestrator, parse_args
-from dpk_fdedup.get_duplicate_list.transform_python import (
-    GetDuplicateListPythonTransformConfiguration,
+from dpk_fdedup.cluster_analysis.ray.transform import (
+    ClusterAnalysisRayTransformConfiguration,
 )
+from dpk_fdedup.data_cleaning.ray.transform import DataCleaningRayTransformConfiguration
 from dpk_fdedup.get_duplicate_list.ray.transform import (
     GetDuplicateListRayRuntime,
     GetDuplicateListRayTransformConfiguration,
 )
-from dpk_fdedup.signature_calc.ray.transform import SignatureCalculationRayTransformConfiguration
+from dpk_fdedup.get_duplicate_list.transform_python import (
+    GetDuplicateListPythonTransformConfiguration,
+)
+from dpk_fdedup.signature_calc.ray.transform import (
+    SignatureCalculationRayTransformConfiguration,
+)
+from dpk_fdedup.transform_python import ServiceOrchestrator, parse_args
 
 
 s3_creds = {
diff --git a/transforms/universal/fdedup/dpk_fdedup/signature_calc/ray/local.py b/transforms/universal/fdedup/dpk_fdedup/signature_calc/ray/local.py
index 2e5b7e2ab..6aa64c174 100644
--- a/transforms/universal/fdedup/dpk_fdedup/signature_calc/ray/local.py
+++ b/transforms/universal/fdedup/dpk_fdedup/signature_calc/ray/local.py
@@ -15,7 +15,9 @@
 
 from data_processing.utils import ParamsUtils
 from data_processing_ray.runtime.ray import RayTransformLauncher
-from dpk_fdedup.signature_calc.ray.transform import SignatureCalculationRayTransformConfiguration
+from dpk_fdedup.signature_calc.ray.transform import (
+    SignatureCalculationRayTransformConfiguration,
+)
 
 
 # create parameters
diff --git a/transforms/universal/fdedup/dpk_fdedup/signature_calc/ray/transform.py b/transforms/universal/fdedup/dpk_fdedup/signature_calc/ray/transform.py
index 9a3b9f42f..79ee689b9 100644
--- a/transforms/universal/fdedup/dpk_fdedup/signature_calc/ray/transform.py
+++ b/transforms/universal/fdedup/dpk_fdedup/signature_calc/ray/transform.py
@@ -15,7 +15,9 @@
     RayTransformRuntimeConfiguration,
 )
 from data_processing_ray.runtime.ray.transform_launcher import RayTransformLauncher
-from dpk_fdedup.signature_calc.transform import SignatureCalculationTransformConfiguration
+from dpk_fdedup.signature_calc.transform import (
+    SignatureCalculationTransformConfiguration,
+)
 
 
 logger = get_logger(__name__)
diff --git a/transforms/universal/fdedup/dpk_fdedup/signature_calc/spark/transform.py b/transforms/universal/fdedup/dpk_fdedup/signature_calc/spark/transform.py
index 9b2de7f28..415336b92 100644
--- a/transforms/universal/fdedup/dpk_fdedup/signature_calc/spark/transform.py
+++ b/transforms/universal/fdedup/dpk_fdedup/signature_calc/spark/transform.py
@@ -15,7 +15,9 @@
     SparkTransformLauncher,
     SparkTransformRuntimeConfiguration,
 )
-from dpk_fdedup.signature_calc.transform import SignatureCalculationTransformConfiguration
+from dpk_fdedup.signature_calc.transform import (
+    SignatureCalculationTransformConfiguration,
+)
 
 
 logger = get_logger(__name__)
diff --git a/transforms/universal/fdedup/dpk_fdedup/signature_calc/transform.py b/transforms/universal/fdedup/dpk_fdedup/signature_calc/transform.py
index d01ee7b85..6d9b27d24 100644
--- a/transforms/universal/fdedup/dpk_fdedup/signature_calc/transform.py
+++ b/transforms/universal/fdedup/dpk_fdedup/signature_calc/transform.py
@@ -361,7 +361,11 @@ def _write_band_signatures(self):
 
     # define shingles generation function
     def _generate_word_shingles(
-        self, row: tuple, shingling_option: str, window_size: int = 5, delimiter: str = " "
+        self,
+        row: tuple,
+        shingling_option: str,
+        window_size: int = 5,
+        delimiter: str = " ",
     ) -> tuple[list, int, int]:
         text = row[0]
         # lower case
@@ -417,7 +421,6 @@ def _process_rows_into_bands(self, df, minhashlsh_num_bands, minhashlsh_length_b
 
 
 class SignatureCalculationTransformConfiguration(TransformConfiguration):
-
     """
     Provides support for configuring and using the associated Transform class
     include configuration with CLI args.
diff --git a/transforms/universal/fdedup/dpk_fdedup/signature_calc/transform_python.py b/transforms/universal/fdedup/dpk_fdedup/signature_calc/transform_python.py
index c5a0db954..6172e401a 100644
--- a/transforms/universal/fdedup/dpk_fdedup/signature_calc/transform_python.py
+++ b/transforms/universal/fdedup/dpk_fdedup/signature_calc/transform_python.py
@@ -17,7 +17,9 @@
     PythonTransformRuntimeConfiguration,
 )
 from data_processing.utils import get_logger
-from dpk_fdedup.signature_calc.transform import SignatureCalculationTransformConfiguration
+from dpk_fdedup.signature_calc.transform import (
+    SignatureCalculationTransformConfiguration,
+)
 
 
 logger = get_logger(__name__)
diff --git a/transforms/universal/fdedup/dpk_fdedup/spark/transform.py b/transforms/universal/fdedup/dpk_fdedup/spark/transform.py
index 77eff4d74..49493f1dd 100644
--- a/transforms/universal/fdedup/dpk_fdedup/spark/transform.py
+++ b/transforms/universal/fdedup/dpk_fdedup/spark/transform.py
@@ -14,17 +14,21 @@
 import os
 import sys
 
-from dpk_fdedup.cluster_analysis.spark.transform import ClusterAnalysisSparkTransformConfiguration
-from dpk_fdedup.data_cleaning.spark.transform import DataCleaningSparkTransformConfiguration
 from data_processing.runtime.pure_python import PythonTransformLauncher
 from data_processing_spark.runtime.spark import SparkTransformLauncher
-from dpk_fdedup.transform_python import ServiceOrchestrator, parse_args
+from dpk_fdedup.cluster_analysis.spark.transform import (
+    ClusterAnalysisSparkTransformConfiguration,
+)
+from dpk_fdedup.data_cleaning.spark.transform import (
+    DataCleaningSparkTransformConfiguration,
+)
 from dpk_fdedup.get_duplicate_list.transform_python import (
     GetDuplicateListPythonTransformConfiguration,
 )
 from dpk_fdedup.signature_calc.spark.transform import (
     SignatureCalculationSparkTransformConfiguration,
 )
+from dpk_fdedup.transform_python import ServiceOrchestrator, parse_args
 
 
 s3_creds = {
diff --git a/transforms/universal/fdedup/dpk_fdedup/transform_python.py b/transforms/universal/fdedup/dpk_fdedup/transform_python.py
index 196affd93..6a7669db0 100644
--- a/transforms/universal/fdedup/dpk_fdedup/transform_python.py
+++ b/transforms/universal/fdedup/dpk_fdedup/transform_python.py
@@ -19,12 +19,14 @@
 import dpk_fdedup.data_cleaning.transform
 import dpk_fdedup.get_duplicate_list.transform
 import dpk_fdedup.signature_calc.transform
+from data_processing.runtime.pure_python import PythonTransformLauncher
+from data_processing.utils import ParamsUtils, get_logger, str2bool
 from dpk_fdedup.cluster_analysis.transform_python import (
     ClusterAnalysisPythonTransformConfiguration,
 )
-from dpk_fdedup.data_cleaning.transform_python import DataCleaningPythonTransformConfiguration
-from data_processing.runtime.pure_python import PythonTransformLauncher
-from data_processing.utils import ParamsUtils, get_logger, str2bool
+from dpk_fdedup.data_cleaning.transform_python import (
+    DataCleaningPythonTransformConfiguration,
+)
 from dpk_fdedup.get_duplicate_list.transform_python import (
     GetDuplicateListPythonTransformConfiguration,
 )
@@ -183,23 +185,41 @@ def parse_args() -> argparse.Namespace:
         help="operation mode for data cleanup: filter out duplicates/non-duplicates, or annotate duplicate documents",
     )
     parser.add_argument(
-        "--contents_column", type=str, required=False, help="name of the column that stores document text"
+        "--contents_column",
+        type=str,
+        required=False,
+        help="name of the column that stores document text",
     )
     parser.add_argument(
-        "--document_id_column", type=str, required=False, help="name of the column that stores document ID"
+        "--document_id_column",
+        type=str,
+        required=False,
+        help="name of the column that stores document ID",
    )
     parser.add_argument("--seed", type=int, required=False, help="seed of the random number generator")
     parser.add_argument(
-        "--num_permutations", type=int, required=False, help="number of permutations to use for minhash calculation"
+        "--num_permutations",
+        type=int,
+        required=False,
+        help="number of permutations to use for minhash calculation",
     )
     parser.add_argument(
-        "--num_bands", type=int, required=False, help="number of bands to use for band hash calculation"
+        "--num_bands",
+        type=int,
+        required=False,
+        help="number of bands to use for band hash calculation",
     )
     parser.add_argument(
-        "--num_minhashes_per_band", type=int, required=False, help="number of minhashes to use in each band"
+        "--num_minhashes_per_band",
+        type=int,
+        required=False,
+        help="number of minhashes to use in each band",
     )
     parser.add_argument(
-        "--word_shingle_size", type=int, required=False, help="number of words included in one shingle"
+        "--word_shingle_size",
+        type=int,
+        required=False,
+        help="number of words included in one shingle",
     )
     parser.add_argument(
         "--jaccard_similarity_threshold",
@@ -261,8 +281,6 @@ def parse_args() -> argparse.Namespace:
 
     return parser.parse_args()
 
-
-
 # Class used by the notebooks to ingest binary files and create parquet files
 class Fdedup:
     def __init__(self, **kwargs):
diff --git a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py
index 6b1265cf8..b6c7684e8 100644
--- a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py
+++ b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py
@@ -142,7 +142,11 @@ def fuzzydedup(
     data_num_samples: int = -1,
     # orchestrator
     runtime_pipeline_id: str = "pipeline_id",
-    runtime_code_location: dict = {"github": "github", "commit_hash": "12345", "path": "path"},
+    runtime_code_location: dict = {
+        "github": "github",
+        "commit_hash": "12345",
+        "path": "path",
+    },
     # columns used
     fdedup_contents_column: str = "contents",
     fdedup_document_id_column: str = "int_id_column",
@@ -209,7 +213,10 @@ def fuzzydedup(
     """
     # create clean_up task
     clean_up_task = cleanup_ray_op(
-        ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params
+        ray_name=ray_name,
+        run_id=run_id,
+        server_url=server_url,
+        additional_params=additional_params,
     )
     ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2)
     # pipeline definition
diff --git a/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py b/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py
index 15722c164..f9f2aebaa 100644
--- a/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py
+++ b/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py
@@ -20,7 +20,13 @@ def compute_common_params(
     num_permutations: int,  # number of permutations (minhashes) per document
     n_samples: int,  # files to sample for number of documents estimation
 ) -> NamedTuple(
-    "fdedup_params", [("num_segments", int), ("num_actors", str), ("actor_cpu", float), ("actor_memory", int)]
+    "fdedup_params",
+    [
+        ("num_segments", int),
+        ("num_actors", str),
+        ("actor_cpu", float),
+        ("actor_memory", int),
+    ],
 ):
     """
     Compute fuzzy dedup execution parameters common to all the transforms
@@ -43,7 +49,13 @@ def compute_common_params(
     s3_creds = {"access_key": s3_key, "secret_key": s3_secret, "url": s3_endpoint}
     s3_config = KFPUtils.load_from_json(data_s3_config.replace("'", '"'))
     # because S3 is the only viable version for kfp-based implementation, we are here creating DataAccess S3 directly
-    data_access = DataAccessS3(s3_credentials=s3_creds, s3_config=s3_config, d_sets=None, checkpoint=False, m_files=-1)
+    data_access = DataAccessS3(
+        s3_credentials=s3_creds,
+        s3_config=s3_config,
+        d_sets=None,
+        checkpoint=False,
+        m_files=-1,
+    )
     # sample input data
     sampling: dict[str, Any]
     sampling, _ = data_access.sample_input_data(n_samples=n_samples)
@@ -108,7 +120,6 @@ def signature_calc_compute_execution_params(
     num_segments: int,  # number of segments
     seed: int,  # seed for the random number generator
 ) -> dict:
-
     """
     Compute fuzzy dedup execution parameters for signature calculation
     :param runtime_num_actors: number of actors computed by KFPUtils.default_compute_execution_params()
@@ -172,7 +183,6 @@ def cluster_analysis_compute_execution_params(
     threshold: float,  # threshold,
     num_segments: int,  # number of segments
 ) -> dict:
-
     """
     Compute fuzzy dedup execution parameters for cluster analysis
     :param runtime_num_actors: number of actors computed by KFPUtils.default_compute_execution_params()
diff --git a/transforms/universal/fdedup/test/test_cluster_analysis_transform_python.py b/transforms/universal/fdedup/test/test_cluster_analysis_transform_python.py
index c14329703..e003cab5d 100644
--- a/transforms/universal/fdedup/test/test_cluster_analysis_transform_python.py
+++ b/transforms/universal/fdedup/test/test_cluster_analysis_transform_python.py
@@ -12,14 +12,14 @@
 
 import os
 
-from dpk_fdedup.cluster_analysis.transform import sort_output_cli_param
-from dpk_fdedup.cluster_analysis.transform_python import (
-    ClusterAnalysisPythonTransformConfiguration,
-)
 from data_processing.runtime.pure_python import PythonTransformLauncher
 from data_processing.test_support.launch.transform_test import (
     AbstractTransformLauncherTest,
 )
+from dpk_fdedup.cluster_analysis.transform import sort_output_cli_param
+from dpk_fdedup.cluster_analysis.transform_python import (
+    ClusterAnalysisPythonTransformConfiguration,
+)
 
 
 class TestPythonClusterAnalysisTransform(AbstractTransformLauncherTest):
diff --git a/transforms/universal/fdedup/test/test_cluster_analysis_transform_ray.py b/transforms/universal/fdedup/test/test_cluster_analysis_transform_ray.py
index 5cfddfc65..078d79bd3 100644
--- a/transforms/universal/fdedup/test/test_cluster_analysis_transform_ray.py
+++ b/transforms/universal/fdedup/test/test_cluster_analysis_transform_ray.py
@@ -12,17 +12,19 @@
 
 import os
 
+from data_processing.test_support.launch.transform_test import (
+    AbstractTransformLauncherTest,
+)
+from data_processing_ray.runtime.ray import RayTransformLauncher
+from dpk_fdedup.cluster_analysis.ray.transform import (
+    ClusterAnalysisRayTransformConfiguration,
+)
 from dpk_fdedup.cluster_analysis.transform import (
     jaccard_similarity_threshold_cli_param,
     num_bands_cli_param,
     num_segments_cli_param,
     sort_output_cli_param,
 )
-from dpk_fdedup.cluster_analysis.ray.transform import ClusterAnalysisRayTransformConfiguration
-from data_processing.test_support.launch.transform_test import (
-    AbstractTransformLauncherTest,
-)
-from data_processing_ray.runtime.ray import RayTransformLauncher
 
 
 class TestRayClusterAnalysisTransform(AbstractTransformLauncherTest):
diff --git a/transforms/universal/fdedup/test/test_cluster_analysis_transform_spark.py b/transforms/universal/fdedup/test/test_cluster_analysis_transform_spark.py
index 990b0cf7b..795bf658e 100644
--- a/transforms/universal/fdedup/test/test_cluster_analysis_transform_spark.py
+++ b/transforms/universal/fdedup/test/test_cluster_analysis_transform_spark.py
@@ -12,12 +12,14 @@
 
 import os
 
-from dpk_fdedup.cluster_analysis.transform import sort_output_cli_param
-from dpk_fdedup.cluster_analysis.spark.transform import ClusterAnalysisSparkTransformConfiguration
 from data_processing.test_support.launch.transform_test import (
     AbstractTransformLauncherTest,
 )
 from data_processing_spark.runtime.spark import SparkTransformLauncher
+from dpk_fdedup.cluster_analysis.spark.transform import (
+    ClusterAnalysisSparkTransformConfiguration,
+)
+from dpk_fdedup.cluster_analysis.transform import sort_output_cli_param
 
 
 class TestSparkClusterAnalysisTransform(AbstractTransformLauncherTest):
diff --git a/transforms/universal/fdedup/test/test_data_cleaning_transform_python.py b/transforms/universal/fdedup/test/test_data_cleaning_transform_python.py
index faa5e8924..5f1fa973e 100644
--- a/transforms/universal/fdedup/test/test_data_cleaning_transform_python.py
+++ b/transforms/universal/fdedup/test/test_data_cleaning_transform_python.py
@@ -12,14 +12,16 @@
 
 import os
 
+from data_processing.runtime.pure_python import PythonTransformLauncher
+from data_processing.test_support.launch.transform_test import (
+    AbstractTransformLauncherTest,
+)
 from dpk_fdedup.data_cleaning.transform import (
     document_id_column_cli_param,
     duplicate_list_location_cli_param,
 )
-from dpk_fdedup.data_cleaning.transform_python import DataCleaningPythonTransformConfiguration
-from data_processing.runtime.pure_python import PythonTransformLauncher
-from data_processing.test_support.launch.transform_test import (
-    AbstractTransformLauncherTest,
+from dpk_fdedup.data_cleaning.transform_python import (
+    DataCleaningPythonTransformConfiguration,
 )
 
 
@@ -45,5 +47,12 @@ def get_test_transform_fixtures(self) -> list[tuple]:
             duplicate_list_location_cli_param: duplicate_location,
         }
         launcher = PythonTransformLauncher(DataCleaningPythonTransformConfiguration())
-        fixtures = [(launcher, config, basedir + "/input", basedir + "/expected/data_cleaning/cleaned")]
+        fixtures = [
+            (
+                launcher,
+                config,
+                basedir + "/input",
+                basedir + "/expected/data_cleaning/cleaned",
+            )
+        ]
         return fixtures
diff --git a/transforms/universal/fdedup/test/test_data_cleaning_transform_ray.py b/transforms/universal/fdedup/test/test_data_cleaning_transform_ray.py
index 960127e51..1471d1f54 100644
--- a/transforms/universal/fdedup/test/test_data_cleaning_transform_ray.py
+++ b/transforms/universal/fdedup/test/test_data_cleaning_transform_ray.py
@@ -12,16 +12,16 @@
 
 import os
 
+from data_processing.test_support.launch.transform_test import (
+    AbstractTransformLauncherTest,
+)
+from data_processing_ray.runtime.ray import RayTransformLauncher
+from dpk_fdedup.data_cleaning.ray.transform import DataCleaningRayTransformConfiguration
 from dpk_fdedup.data_cleaning.transform import (
     document_id_column_cli_param,
     duplicate_list_location_cli_param,
     operation_mode_cli_param,
 )
-from dpk_fdedup.data_cleaning.ray.transform import DataCleaningRayTransformConfiguration
-from data_processing.test_support.launch.transform_test import (
-    AbstractTransformLauncherTest,
-)
-from data_processing_ray.runtime.ray import RayTransformLauncher
 
 
 class TestRayDataCleaningTransform(AbstractTransformLauncherTest):
diff --git a/transforms/universal/fdedup/test/test_data_cleaning_transform_spark.py b/transforms/universal/fdedup/test/test_data_cleaning_transform_spark.py
index 9639980b3..58b6784ab 100644
--- a/transforms/universal/fdedup/test/test_data_cleaning_transform_spark.py
+++ b/transforms/universal/fdedup/test/test_data_cleaning_transform_spark.py
@@ -12,16 +12,18 @@
 
 import os
 
+from data_processing.test_support.launch.transform_test import (
+    AbstractTransformLauncherTest,
+)
+from data_processing_spark.runtime.spark import SparkTransformLauncher
+from dpk_fdedup.data_cleaning.spark.transform import (
+    DataCleaningSparkTransformConfiguration,
+)
 from dpk_fdedup.data_cleaning.transform import (
     document_id_column_cli_param,
     duplicate_list_location_cli_param,
     operation_mode_cli_param,
 )
-from dpk_fdedup.data_cleaning.spark.transform import DataCleaningSparkTransformConfiguration
-from data_processing.test_support.launch.transform_test import (
-    AbstractTransformLauncherTest,
-)
-from data_processing_spark.runtime.spark import SparkTransformLauncher
 
 
 class TestSparkDataCleaningTransform(AbstractTransformLauncherTest):
diff --git a/transforms/universal/fdedup/test/test_get_duplicate_list_transform_ray.py b/transforms/universal/fdedup/test/test_get_duplicate_list_transform_ray.py
index 017e560b0..45747da22 100644
--- a/transforms/universal/fdedup/test/test_get_duplicate_list_transform_ray.py
+++ b/transforms/universal/fdedup/test/test_get_duplicate_list_transform_ray.py
@@ -16,8 +16,10 @@
     AbstractTransformLauncherTest,
 )
 from data_processing_ray.runtime.ray import RayTransformLauncher
+from dpk_fdedup.get_duplicate_list.ray.transform import (
+    GetDuplicateListRayTransformConfiguration,
+)
 from dpk_fdedup.get_duplicate_list.transform import sort_output_cli_param
-from dpk_fdedup.get_duplicate_list.ray.transform import GetDuplicateListRayTransformConfiguration
 
 
 class TestPythonGetDuplicateListTransform(AbstractTransformLauncherTest):
diff --git a/transforms/universal/fdedup/test/test_signature_calc_transform_python.py b/transforms/universal/fdedup/test/test_signature_calc_transform_python.py
index 2b6c49e31..9d57397d0 100644
--- a/transforms/universal/fdedup/test/test_signature_calc_transform_python.py
+++ b/transforms/universal/fdedup/test/test_signature_calc_transform_python.py
@@ -36,5 +36,12 @@ def get_test_transform_fixtures(self) -> list[tuple]:
             "minhash_num_segments": 2,
         }
         launcher = PythonTransformLauncher(SignatureCalculationPythonTransformConfiguration())
-        fixtures = [(launcher, config, basedir + "/input/", basedir + "/expected/signature_calc/")]
+        fixtures = [
+            (
+                launcher,
+                config,
+                basedir + "/input/",
+                basedir + "/expected/signature_calc/",
+            )
+        ]
         return fixtures
diff --git a/transforms/universal/fdedup/test/test_signature_calc_transform_ray.py b/transforms/universal/fdedup/test/test_signature_calc_transform_ray.py
index 8c08eb938..a392e0d07 100644
--- a/transforms/universal/fdedup/test/test_signature_calc_transform_ray.py
+++ b/transforms/universal/fdedup/test/test_signature_calc_transform_ray.py
@@ -17,12 +17,14 @@
 )
 from data_processing.utils import ParamsUtils
 from data_processing_ray.runtime.ray import RayTransformLauncher
+from dpk_fdedup.signature_calc.ray.transform import (
+    SignatureCalculationRayTransformConfiguration,
+)
 from dpk_fdedup.signature_calc.transform import (
     num_bands_cli_param,
     num_permutations_cli_param,
     num_segments_cli_param,
 )
-from dpk_fdedup.signature_calc.ray.transform import SignatureCalculationRayTransformConfiguration
 
 
 class TestRaySignatureCalcTransform(AbstractTransformLauncherTest):
@@ -41,6 +43,11 @@ def get_test_transform_fixtures(self) -> list[tuple]:
         }
         launcher = RayTransformLauncher(SignatureCalculationRayTransformConfiguration())
         fixtures = [
-            (launcher, config, os.path.join(basedir, "input"), os.path.join(basedir, "expected", "signature_calc"))
+            (
+                launcher,
+                config,
+                os.path.join(basedir, "input"),
+                os.path.join(basedir, "expected", "signature_calc"),
+            )
         ]
         return fixtures
diff --git a/transforms/universal/fdedup/test/test_signature_calc_transform_spark.py b/transforms/universal/fdedup/test/test_signature_calc_transform_spark.py
index af8f36aa9..41be123c2 100644
--- a/transforms/universal/fdedup/test/test_signature_calc_transform_spark.py
+++ b/transforms/universal/fdedup/test/test_signature_calc_transform_spark.py
@@ -37,6 +37,11 @@ def get_test_transform_fixtures(self) -> list[tuple]:
         }
         launcher = SparkTransformLauncher(SignatureCalculationSparkTransformConfiguration())
         fixtures = [
-            (launcher, config, os.path.join(basedir, "input"), os.path.join(basedir, "expected", "signature_calc"))
+            (
+                launcher,
+                config,
+                os.path.join(basedir, "input"),
+                os.path.join(basedir, "expected", "signature_calc"),
+            )
         ]
         return fixtures
diff --git a/transforms/universal/filter/dpk_filter/local_python.py b/transforms/universal/filter/dpk_filter/local_python.py
index 18745f50a..1bd7a191d 100644
--- a/transforms/universal/filter/dpk_filter/local_python.py
+++ b/transforms/universal/filter/dpk_filter/local_python.py
@@ -15,12 +15,13 @@
 
 from data_processing.runtime.pure_python import PythonTransformLauncher
 from data_processing.utils import ParamsUtils
+from dpk_filter.transform_python import FilterPythonTransformConfiguration
+
 from transforms.universal.filter.dpk_filter.transform import (
     filter_columns_to_drop_cli_param,
     filter_criteria_cli_param,
     filter_logical_operator_cli_param,
 )
-from dpk_filter.transform_python import FilterPythonTransformConfiguration
 
 
 # create parameters
diff --git a/transforms/universal/filter/dpk_filter/ray/local.py b/transforms/universal/filter/dpk_filter/ray/local.py
index 2d2165b89..165d04943 100644
--- a/transforms/universal/filter/dpk_filter/ray/local.py
+++ b/transforms/universal/filter/dpk_filter/ray/local.py
@@ -15,12 +15,12 @@
 
 from data_processing.utils import ParamsUtils
 from data_processing_ray.runtime.ray import RayTransformLauncher
+from dpk_filter.ray.transform import FilterRayTransformConfiguration
 from dpk_filter.transform import (
     filter_columns_to_drop_cli_param,
     filter_criteria_cli_param,
     filter_logical_operator_cli_param,
 )
-from dpk_filter.ray.transform import FilterRayTransformConfiguration
 
 
 # create parameters
diff --git a/transforms/universal/filter/dpk_filter/ray/s3.py b/transforms/universal/filter/dpk_filter/ray/s3.py
index a30ff0c4d..885511802 100644
--- a/transforms/universal/filter/dpk_filter/ray/s3.py
+++ b/transforms/universal/filter/dpk_filter/ray/s3.py
@@ -14,12 +14,12 @@
 
 from data_processing.utils import ParamsUtils
 from data_processing_ray.runtime.ray import RayTransformLauncher
+from dpk_filter.ray.transform import FilterRayTransformConfiguration
 from dpk_filter.transform import (
     filter_columns_to_drop_cli_param,
     filter_criteria_cli_param,
     filter_logical_operator_cli_param,
 )
-from dpk_filter.ray.transform import FilterRayTransformConfiguration
 
 
 # create parameters
diff --git a/transforms/universal/filter/dpk_filter/spark/local.py b/transforms/universal/filter/dpk_filter/spark/local.py
index b683a626f..7deaab2c3 100644
--- a/transforms/universal/filter/dpk_filter/spark/local.py
+++ b/transforms/universal/filter/dpk_filter/spark/local.py
@@ -15,12 +15,12 @@
 
 from data_processing.utils import ParamsUtils
 from data_processing_spark.runtime.spark import SparkTransformLauncher
+from dpk_filter.spark.transform import FilterSparkTransformConfiguration
 from dpk_filter.transform import (
     filter_columns_to_drop_cli_param,
     filter_criteria_cli_param,
     filter_logical_operator_cli_param,
 )
-from dpk_filter.spark.transform import FilterSparkTransformConfiguration
 
 
 # create parameters
diff --git a/transforms/universal/filter/dpk_filter/spark/transform.py b/transforms/universal/filter/dpk_filter/spark/transform.py
index 66a7dfcc9..e6bdf8a72 100644
--- a/transforms/universal/filter/dpk_filter/spark/transform.py
+++ b/transforms/universal/filter/dpk_filter/spark/transform.py
@@ -11,8 +11,10 @@
 ################################################################################
 
 from data_processing.utils import get_logger
-from data_processing_spark.runtime.spark import SparkTransformLauncher
-from data_processing_spark.runtime.spark import SparkTransformRuntimeConfiguration
+from data_processing_spark.runtime.spark import (
+    SparkTransformLauncher,
+    SparkTransformRuntimeConfiguration,
+)
 from dpk_filter.transform import FilterTransformConfiguration
 
 
diff --git a/transforms/universal/filter/dpk_filter/transform_python.py b/transforms/universal/filter/dpk_filter/transform_python.py
index 26ada468a..a80d71c69 100644
--- a/transforms/universal/filter/dpk_filter/transform_python.py
+++ b/transforms/universal/filter/dpk_filter/transform_python.py
@@ -10,12 +10,12 @@
 #  limitations under the License.
 ################################################################################
 import sys
+
 from data_processing.runtime.pure_python import PythonTransformLauncher
 from data_processing.runtime.pure_python.runtime_configuration import (
     PythonTransformRuntimeConfiguration,
 )
-from data_processing.utils import get_logger, ParamsUtils
-
+from data_processing.utils import ParamsUtils, get_logger
 from dpk_filter.transform import FilterTransformConfiguration
 
 
diff --git a/transforms/universal/filter/kfp_ray/filter_wf.py b/transforms/universal/filter/kfp_ray/filter_wf.py
index e40fff775..9c14eff3c 100644
--- a/transforms/universal/filter/kfp_ray/filter_wf.py
+++ b/transforms/universal/filter/kfp_ray/filter_wf.py
@@ -29,6 +29,7 @@
 # path to kfp component specifications files
 component_spec_path = "../../../../kfp/kfp_ray_components/"
 
+
 # compute execution parameters. Here different transforms might need different implementations. As
 # a result, instead of creating a component we are creating it in place here.
 def compute_exec_params_func(
@@ -103,7 +104,14 @@ def filtering(
     ray_name: str = "filter-kfp-ray",  # name of Ray cluster
     # Add image_pull_secret and image_pull_policy to ray workers if needed
     ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image},
-    ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image},
+    ray_worker_options: dict = {
+        "replicas": 2,
+        "max_replicas": 2,
+        "min_replicas": 2,
+        "cpu": 2,
+        "memory": 4,
+        "image": task_image,
+    },
     server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888",
     # data access
     data_s3_config: str = "{'input_folder': 'test/filter/input/', 'output_folder': 'test/filter/output/'}",
@@ -111,9 +119,13 @@ def filtering(
     data_max_files: int = -1,
     data_num_samples: int = -1,
     # orchestrator
-    runtime_actor_options: dict = {'num_cpus': 0.8},
+    runtime_actor_options: dict = {"num_cpus": 0.8},
     runtime_pipeline_id: str = "pipeline_id",
-    runtime_code_location: dict = {'github': 'github', 'commit_hash': '12345', 'path': 'path'},
+    runtime_code_location: dict = {
+        "github": "github",
+        "commit_hash": "12345",
+        "path": "path",
+    },
     # filtering parameters
     filter_criteria_list: str = "['docq_total_words > 100 AND docq_total_words < 200', 'ibmkenlm_docq_perplex_score < 230']",
     filter_logical_operator: str = "AND",
@@ -160,7 +172,12 @@ def filtering(
     :return: None
     """
     # create clean_up task
-    clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params)
+    clean_up_task = cleanup_ray_op(
+        ray_name=ray_name,
+        run_id=run_id,
+        server_url=server_url,
+        additional_params=additional_params,
+    )
     ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2)
     # pipeline definition
     with dsl.ExitHandler(clean_up_task):
diff --git a/transforms/universal/filter/test/test_filter.py b/transforms/universal/filter/test/test_filter.py
index 228e981ba..a819860e7 100644
--- a/transforms/universal/filter/test/test_filter.py
+++ b/transforms/universal/filter/test/test_filter.py
@@ -56,7 +56,12 @@ def create_filter_test_fixture(
         with open(os.path.join(expected_output_dir, "metadata.json"), "r") as meta_file:
             expected_metadata = json.load(meta_file)
         expected_metadata_list = [expected_metadata, {}]
-        return FilterTransform(config), [input_df], [expected_output_df], expected_metadata_list
+        return (
+            FilterTransform(config),
+            [input_df],
+            [expected_output_df],
+            expected_metadata_list,
+        )
 
     def get_test_transform_fixtures(self) -> list[tuple]:
         fixtures = []
diff --git a/transforms/universal/filter/test/test_filter_ray.py b/transforms/universal/filter/test/test_filter_ray.py
index ba32d136f..68a696df9 100644
--- a/transforms/universal/filter/test/test_filter_ray.py
+++ b/transforms/universal/filter/test/test_filter_ray.py
@@ -14,8 +14,8 @@
 
 from data_processing.runtime import AbstractTransformLauncher
 from data_processing_ray.runtime.ray import RayTransformLauncher
-from dpk_filter.test_support import AbstractPythonFilterTransformTest
 from dpk_filter.ray.transform import FilterRayTransformConfiguration
+from dpk_filter.test_support import AbstractPythonFilterTransformTest
 
 
 class TestPythonFilterTransform(AbstractPythonFilterTransformTest):
@@ -29,4 +29,7 @@ def _get_test_file_directory(self) -> str:
         return dir
 
     def _get_launcher(self) -> (AbstractTransformLauncher, dict):
-        return (RayTransformLauncher(FilterRayTransformConfiguration()), {"run_locally": True})
+        return (
+            RayTransformLauncher(FilterRayTransformConfiguration()),
+            {"run_locally": True},
+        )
diff --git a/transforms/universal/filter/test/test_filter_spark.py b/transforms/universal/filter/test/test_filter_spark.py
index ab4d50c12..13b7c394d 100644
--- a/transforms/universal/filter/test/test_filter_spark.py
+++ b/transforms/universal/filter/test/test_filter_spark.py
@@ -13,8 +13,8 @@
 
 from data_processing.runtime import AbstractTransformLauncher
 from data_processing_spark.runtime.spark import SparkTransformLauncher
-from dpk_filter.test_support import AbstractPythonFilterTransformTest
 from dpk_filter.spark.transform import FilterSparkTransformConfiguration
+from dpk_filter.test_support import AbstractPythonFilterTransformTest
 
 
 class TestSparkFilterTransform1(AbstractPythonFilterTransformTest):
diff --git a/transforms/universal/hap/dpk_hap/transform.py b/transforms/universal/hap/dpk_hap/transform.py
index a3ac274f0..392359328 100644
--- a/transforms/universal/hap/dpk_hap/transform.py
+++ b/transforms/universal/hap/dpk_hap/transform.py
@@ -50,7 +50,11 @@ def _apply_model(self, data: list, batch_size: int) -> list[float]:
             if start_idx >= end_idx:
                 continue
             inputs = self.tokenizer(
-                data[start_idx:end_idx], max_length=self.max_length, padding=True, truncation=True, return_tensors="pt"
+                data[start_idx:end_idx],
+                max_length=self.max_length,
+                padding=True,
+                truncation=True,
+                return_tensors="pt",
             ).to(device)
             with torch.no_grad():
                 logits = self.model(**inputs).logits
diff --git a/transforms/universal/hap/kfp_ray/hap_wf.py b/transforms/universal/hap/kfp_ray/hap_wf.py
index 2008f091e..d1eb77935 100644
--- a/transforms/universal/hap/kfp_ray/hap_wf.py
+++ b/transforms/universal/hap/kfp_ray/hap_wf.py
@@ -28,6 +28,7 @@
 # path to kfp component specifications files
 component_spec_path = "../../../../kfp/kfp_ray_components/"
 
+
 # compute execution parameters. Here different transforms might need different implementations. As
 # a result, instead of creating a component we are creating it in place here.
 def compute_exec_params_func(
@@ -129,7 +130,11 @@ def hap(
     # orchestrator
     runtime_actor_options: dict = {"num_cpus": 0.8},
     runtime_pipeline_id: str = "pipeline_id",
-    runtime_code_location: dict = {"github": "github", "commit_hash": "12345", "path": "path"},
+    runtime_code_location: dict = {
+        "github": "github",
+        "commit_hash": "12345",
+        "path": "path",
+    },
     # hap parameters
     model_name_or_path: str = "ibm-granite/granite-guardian-hap-38m",
     annotation_column: str = "hap_score",
@@ -183,7 +188,10 @@ def hap(
     """
     # create clean_up task
     clean_up_task = cleanup_ray_op(
-        ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params
+        ray_name=ray_name,
+        run_id=run_id,
+        server_url=server_url,
+        additional_params=additional_params,
     )
     ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2)
     # pipeline definition
diff --git a/transforms/universal/hap/test/test_hap.py b/transforms/universal/hap/test/test_hap.py
index e0a455064..02a214c52 100644
--- a/transforms/universal/hap/test/test_hap.py
+++ b/transforms/universal/hap/test/test_hap.py
@@ -47,6 +47,11 @@ def get_test_transform_fixtures(self) -> list[tuple]:
 
         #     table_list, metadata = transform.transform(table)
         fixtures = [
-            (HAPTransform(hap_params), input_tables, expected_tables, expected_metadata_list),
+            (
+                HAPTransform(hap_params),
+                input_tables,
+                expected_tables,
+                expected_metadata_list,
+            ),
         ]
         return fixtures
diff --git a/transforms/universal/noop/kfp_ray/noop_multiple_wf.py b/transforms/universal/noop/kfp_ray/noop_multiple_wf.py
index 3b102d205..d20ae2a77 100644
--- a/transforms/universal/noop/kfp_ray/noop_multiple_wf.py
+++ b/transforms/universal/noop/kfp_ray/noop_multiple_wf.py
@@ -28,6 +28,7 @@
 # path to kfp component specifications files
 component_spec_path = "../../../../kfp/kfp_ray_components/"
 
+
 # compute execution parameters. Here different transforms might need different implementations. As
 # a result, instead of creating a component we are creating it in place here.
 def compute_exec_params_func(
@@ -98,7 +99,14 @@ def noop(
     ray_name: str = "noop-kfp-ray",  # name of Ray cluster
     # Add image_pull_secret and image_pull_policy to ray workers if needed
     ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image},
-    ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image},
+    ray_worker_options: dict = {
+        "replicas": 2,
+        "max_replicas": 2,
+        "min_replicas": 2,
+        "cpu": 2,
+        "memory": 4,
+        "image": task_image,
+    },
     server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888",
     # data access
     data_s3_config: str = "[{'input_folder': 'test/noop/input/', 'output_folder': 'test/noop/output/'}]",
@@ -106,9 +114,13 @@ def noop(
     data_max_files: int = -1,
     data_num_samples: int = -1,
     # orchestrator
-    runtime_actor_options: dict = {'num_cpus': 0.8},
+    runtime_actor_options: dict = {"num_cpus": 0.8},
     runtime_pipeline_id: str = "pipeline_id",
-    runtime_code_location: dict = {'github': 'github', 'commit_hash': '12345', 'path': 'path'},
+    runtime_code_location: dict = {
+        "github": "github",
+        "commit_hash": "12345",
+        "path": "path",
+    },
     # noop parameters
     noop_sleep_sec: int = 10,
     # additional parameters
@@ -151,7 +163,12 @@ def noop(
     :return: None
     """
     # create clean_up task
-    clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params)
+    clean_up_task = cleanup_ray_op(
+        ray_name=ray_name,
+        run_id=run_id,
+        server_url=server_url,
+        additional_params=additional_params,
+    )
     ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2)
     # pipeline definition
     with dsl.ExitHandler(clean_up_task):
diff --git a/transforms/universal/noop/kfp_ray/noop_wf.py b/transforms/universal/noop/kfp_ray/noop_wf.py
index e8125328b..a725b66e6 100644
--- a/transforms/universal/noop/kfp_ray/noop_wf.py
+++ b/transforms/universal/noop/kfp_ray/noop_wf.py
@@ -14,7 +14,6 @@
 import kfp.compiler as compiler
 import kfp.components as comp
 import kfp.dsl as dsl
-
 from workflow_support.compile_utils import ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils
 
 
@@ -29,6 +28,7 @@
 # path to kfp component specifications files
 component_spec_path = "../../../../kfp/kfp_ray_components/"
 
+
 # compute execution parameters. Here different transforms might need different implementations. As
 # a result, instead of creating a component we are creating it in place here.
 def compute_exec_params_func(
@@ -100,7 +100,14 @@ def noop(
     ray_name: str = "noop-kfp-ray",  # name of Ray cluster
     # Add image_pull_secret, image_pull_policy and tolerations to ray options if needed
     ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image},
-    ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image},
+    ray_worker_options: dict = {
+        "replicas": 2,
+        "max_replicas": 2,
+        "min_replicas": 2,
+        "cpu": 2,
+        "memory": 4,
+        "image": task_image,
+    },
     server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888",
     # data access
     data_s3_config: str = "{'input_folder': 'test/noop/input/', 'output_folder': 'test/noop/output/'}",
@@ -109,9 +116,13 @@ def noop(
     data_num_samples: int = -1,
     data_checkpointing: bool = False,
     # orchestrator
-    runtime_actor_options: dict = {'num_cpus': 0.8},
+    runtime_actor_options: dict = {"num_cpus": 0.8},
     runtime_pipeline_id: str = "pipeline_id",
-    runtime_code_location: dict = {'github': 'github', 'commit_hash': '12345', 'path': 'path'},
+    runtime_code_location: dict = {
+        "github": "github",
+        "commit_hash": "12345",
+        "path": "path",
+    },
     # noop parameters
     noop_sleep_sec: int = 10,
     # additional parameters
@@ -155,7 +166,10 @@ def noop(
     """
     # create clean_up task
     clean_up_task = cleanup_ray_op(
-        ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params
+        ray_name=ray_name,
+        run_id=run_id,
+        server_url=server_url,
+        additional_params=additional_params,
    )
     ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2)
     # pipeline definition
@@ -199,6 +213,7 @@ def noop(
         ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret)
         execute_job.after(ray_cluster)
 
+
 if __name__ == "__main__":
     # Compiling the pipeline
-    compiler.Compiler().compile(noop, __file__.replace(".py", ".yaml"))
\ No newline at end of file
+    compiler.Compiler().compile(noop, __file__.replace(".py", ".yaml"))
diff --git a/transforms/universal/noop/python/src/noop_transform.py b/transforms/universal/noop/python/src/noop_transform.py
index a4bf5bb5c..075d9995b 100644
--- a/transforms/universal/noop/python/src/noop_transform.py
+++ b/transforms/universal/noop/python/src/noop_transform.py
@@ -63,7 +63,6 @@ def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Tab
 
 
 class NOOPTransformConfiguration(TransformConfiguration):
-
     """
     Provides support for configuring and using the associated Transform class
     include configuration with CLI args.
diff --git a/transforms/universal/noop/python/test/test_noop.py b/transforms/universal/noop/python/test/test_noop.py
index 73ec18b31..24cd69afb 100644
--- a/transforms/universal/noop/python/test/test_noop.py
+++ b/transforms/universal/noop/python/test/test_noop.py
@@ -21,7 +21,10 @@
 
 table = pa.Table.from_pydict({"name": pa.array(["Tom"]), "age": pa.array([23])})
 expected_table = table  # We're a noop after all.
-expected_metadata_list = [{"nfiles": 1, "nrows": 1}, {}]  # transform() result # flush() result
+expected_metadata_list = [
+    {"nfiles": 1, "nrows": 1},
+    {},
+]  # transform() result # flush() result
 
 
 class TestNOOPTransform(AbstractTableTransformTest):
@@ -39,6 +42,11 @@ def get_test_transform_fixtures(self) -> list[tuple]:
         expected_metadata_list = [{"nfiles": 1, "nrows": 7}, {}]
         config = {sleep_key: 0}
         fixtures = [
-            (NOOPTransform(config), input_tables, expected_tables, expected_metadata_list),
+            (
+                NOOPTransform(config),
+                input_tables,
+                expected_tables,
+                expected_metadata_list,
+            ),
         ]
         return fixtures
diff --git a/transforms/universal/noop/spark/src/noop_transform_spark.py b/transforms/universal/noop/spark/src/noop_transform_spark.py
index d6b5fd2ba..a8638ec97 100644
--- a/transforms/universal/noop/spark/src/noop_transform_spark.py
+++ b/transforms/universal/noop/spark/src/noop_transform_spark.py
@@ -10,7 +10,10 @@
 #  limitations under the License.
 ################################################################################
 from data_processing.utils import get_logger
-from data_processing_spark.runtime.spark import SparkTransformLauncher, SparkTransformRuntimeConfiguration
+from data_processing_spark.runtime.spark import (
+    SparkTransformLauncher,
+    SparkTransformRuntimeConfiguration,
+)
 from noop_transform import NOOPTransformConfiguration
 
 
diff --git a/transforms/universal/profiler/kfp_ray/profiler_wf.py b/transforms/universal/profiler/kfp_ray/profiler_wf.py
index 914637895..214d50e05 100644
--- a/transforms/universal/profiler/kfp_ray/profiler_wf.py
+++ b/transforms/universal/profiler/kfp_ray/profiler_wf.py
@@ -74,7 +74,14 @@ def profiler(
     ray_name: str = "profiler-kfp-ray",  # name of Ray cluster
     # Add image_pull_secret and image_pull_policy to ray workers if needed
     ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image},
-    ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image},
+    ray_worker_options: dict = {
+        "replicas": 2,
+        "max_replicas": 2,
+        "min_replicas": 2,
+        "cpu": 2,
+        "memory": 4,
+        "image": task_image,
+    },
     server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888",
     # data access. checkpointing is not supported by dedup
     data_s3_config: str = "{'input_folder': 'test/profiler/input/', 'output_folder': 'test/profiler/output'}",
@@ -84,7 +91,11 @@ def profiler(
     # orchestrator
     runtime_actor_options: dict = {"num_cpus": 0.8},
     runtime_pipeline_id: str = "pipeline_id",
-    runtime_code_location: dict = {'github': 'github', 'commit_hash': '12345', 'path': 'path'},
+    runtime_code_location: dict = {
+        "github": "github",
+        "commit_hash": "12345",
+        "path": "path",
+    },
     # profiler
     profiler_aggregator_cpu: float = 0.5,
     profiler_doc_column: str = "contents",
@@ -132,7 +143,12 @@ def profiler(
     :return: None
     """
     # create clean_up task
-    clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params)
+    clean_up_task = cleanup_ray_op(
+        ray_name=ray_name,
+        run_id=run_id,
+        server_url=server_url,
+        additional_params=additional_params,
+    )
     ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2)
     # pipeline definition
     with dsl.ExitHandler(clean_up_task):
diff --git a/transforms/universal/profiler/kfp_ray/src/profiler_compute_execution_params.py b/transforms/universal/profiler/kfp_ray/src/profiler_compute_execution_params.py
index a5483eec7..71920ff70 100644
--- a/transforms/universal/profiler/kfp_ray/src/profiler_compute_execution_params.py
+++ b/transforms/universal/profiler/kfp_ray/src/profiler_compute_execution_params.py
@@ -69,7 +69,13 @@ def profiler_compute_execution_params(
     # S3 config is list. take the first element
     s3_config = s3_config[0]
     # because S3 is the only viable version for kfp-based implementation, we are here creating DataAccess S3 directly
-    data_access = DataAccessS3(s3_credentials=s3_creds, s3_config=s3_config, d_sets=None, checkpoint=False, m_files=-1)
+    data_access = DataAccessS3(
+        s3_credentials=s3_creds,
+        s3_config=s3_config,
+        d_sets=None,
+        checkpoint=False,
+        m_files=-1,
+    )
     # sample input data
     sampling, _ = data_access.sample_input_data(n_samples=n_samples)
     avg_doc_size = sampling.get("average doc size KB")
diff --git a/transforms/universal/profiler/python/src/profiler_local.py b/transforms/universal/profiler/python/src/profiler_local.py
index cf9e0711f..f8c65d153 100644
--- a/transforms/universal/profiler/python/src/profiler_local.py
+++ b/transforms/universal/profiler/python/src/profiler_local.py
@@ -12,10 +12,9 @@
 
 import os
 
-from data_processing.data_access import DataAccessLocal, DataAccessFactory
-from profiler_transform_base import DataAggregator
+from data_processing.data_access import DataAccessFactory, DataAccessLocal
+from profiler_transform_base import DataAggregator, doc_column_name_key
 from profiler_transform_python import ProfilerTransform
-from profiler_transform_base import doc_column_name_key
 
 
 # create parameters
@@ -26,8 +25,10 @@
     "output_folder": output_folder,
 }
 
-profiler_params = {doc_column_name_key: "contents",
-                   "aggregator": DataAggregator({"data_access_factory": DataAccessFactory()})}
+profiler_params = {
+    doc_column_name_key: "contents",
+    "aggregator": DataAggregator({"data_access_factory": DataAccessFactory()}),
+}
 
 if __name__ == "__main__":
     # Here we show how to run outside of ray
diff --git a/transforms/universal/profiler/python/src/profiler_local_python.py b/transforms/universal/profiler/python/src/profiler_local_python.py
index 18f032679..7f970da4b 100644
--- a/transforms/universal/profiler/python/src/profiler_local_python.py
+++ b/transforms/universal/profiler/python/src/profiler_local_python.py
@@ -15,8 +15,8 @@
 
 from data_processing.runtime.pure_python import PythonTransformLauncher
 from data_processing.utils import ParamsUtils
-from profiler_transform_python import ProfilerPythonTransformRuntimeConfiguration
 from profiler_transform_base import doc_column_name_cli_param
+from profiler_transform_python import ProfilerPythonTransformRuntimeConfiguration
 
 
 # create launcher
diff --git a/transforms/universal/profiler/python/src/profiler_transform_base.py b/transforms/universal/profiler/python/src/profiler_transform_base.py
index c28e57bfa..b391307d2 100644
--- a/transforms/universal/profiler/python/src/profiler_transform_base.py
+++ b/transforms/universal/profiler/python/src/profiler_transform_base.py
@@ -18,7 +18,12 @@
 
 import pyarrow as pa
 from data_processing.transform import AbstractTableTransform, TransformConfiguration
-from data_processing.utils import GB, CLIArgumentProvider, TransformUtils, UnrecoverableException
+from data_processing.utils import (
+    GB,
+    CLIArgumentProvider,
+    TransformUtils,
+    UnrecoverableException,
+)
 
 
 short_name = "profiler"
@@ -150,6 +155,7 @@ def __init__(self, transform_class: type[AbstractTableTransform], print_config:
             transform_class=transform_class,
         )
         from data_processing.utils import get_logger
+
         self.logger = get_logger(__name__)
         self.print_config = print_config
 
@@ -161,7 +167,8 @@ def add_input_params(self, parser: ArgumentParser) -> None:
             f"--{doc_column_name_cli_param}",
             type=str,
             default="contents",
-            help="key for accessing data")
+            help="key for accessing data",
+        )
 
     def apply_input_params(self, args: Namespace) -> bool:
         """
diff --git a/transforms/universal/profiler/python/src/profiler_transform_python.py b/transforms/universal/profiler/python/src/profiler_transform_python.py
index bdd5aaee7..375919e1d 100644
--- a/transforms/universal/profiler/python/src/profiler_transform_python.py
+++ b/transforms/universal/profiler/python/src/profiler_transform_python.py
@@ -18,12 +18,12 @@
     PythonTransformLauncher,
     PythonTransformRuntimeConfiguration,
 )
-from data_processing.utils import UnrecoverableException
 from data_processing.transform import TransformStatistics
+from data_processing.utils import UnrecoverableException
 from profiler_transform_base import (
+    DataAggregator,
     ProfilerTransformBase,
     ProfilerTransformConfigurationBase,
-    DataAggregator,
 )
 
 
@@ -59,12 +59,16 @@ class ProfilerRuntime(DefaultPythonTransformRuntime):
 
     def __init__(self, params: dict[str, Any]):
         from data_processing.utils import get_logger
+
         super().__init__(params=params)
         self.aggregator = None
         self.logger = get_logger(__name__)
 
     def get_transform_config(
-        self, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics, files: list[str]
+        self,
+        data_access_factory: DataAccessFactoryBase,
+        statistics: TransformStatistics,
+        files: list[str],
     ) -> dict[str, Any]:
         """
         Get the dictionary of configuration that will be provided to the transform's initializer.
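The profiler hunks above all touch the same runtime pattern: a runtime object creates one shared aggregator and injects it into the configuration dictionary handed to each transform via get_transform_config. For readers skimming the patch, here is a minimal, self-contained sketch of that pattern; WordCountAggregator and ProfilerLikeRuntime are hypothetical names for illustration only, not part of the data_processing or profiler APIs being reformatted here.

```python
from typing import Any


class WordCountAggregator:
    """Hypothetical stand-in for the DataAggregator wired in above."""

    def __init__(self) -> None:
        self.unique_words: set[str] = set()

    def add(self, text: str) -> None:
        # accumulate a global vocabulary across all processed documents
        self.unique_words.update(text.lower().split())


class ProfilerLikeRuntime:
    """Sketch of a runtime whose get_transform_config() injects one shared
    aggregator into every transform's configuration dictionary."""

    def __init__(self, params: dict[str, Any]):
        self.params = params
        self.aggregator: WordCountAggregator | None = None

    def get_transform_config(self, files: list[str]) -> dict[str, Any]:
        # create the aggregator once; every transform then shares it
        self.aggregator = WordCountAggregator()
        return self.params | {"aggregator": self.aggregator}


if __name__ == "__main__":
    runtime = ProfilerLikeRuntime({"doc_column": "contents"})
    config = runtime.get_transform_config(files=["part-0.parquet"])
    config["aggregator"].add("some document contents")
    print(sorted(config["aggregator"].unique_words))
```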
diff --git a/transforms/universal/profiler/python/test/test_profiler.py b/transforms/universal/profiler/python/test/test_profiler.py
index a174f68b9..715151f87 100644
--- a/transforms/universal/profiler/python/test/test_profiler.py
+++ b/transforms/universal/profiler/python/test/test_profiler.py
@@ -13,12 +13,11 @@
 import os
 from typing import Tuple
 
+from data_processing.data_access import DataAccessFactory
 from data_processing.test_support import get_tables_in_folder
 from data_processing.test_support.transform import AbstractTableTransformTest
-from profiler_transform_base import DataAggregator
+from profiler_transform_base import DataAggregator, doc_column_name_key
 from profiler_transform_python import ProfilerTransform
-from profiler_transform_base import doc_column_name_key
-from data_processing.data_access import DataAccessFactory
 
 
 class TestProfilerTransform(AbstractTableTransformTest):
@@ -32,9 +31,16 @@ def get_test_transform_fixtures(self) -> list[Tuple]:
         input_dir = os.path.join(basedir, "input")
         input_tables = get_tables_in_folder(input_dir)
         expected_metadata_list = [{}, {}]
-        config = {doc_column_name_key: "contents",
-                  "aggregator": DataAggregator({"data_access_factory": DataAccessFactory()})}
+        config = {
+            doc_column_name_key: "contents",
+            "aggregator": DataAggregator({"data_access_factory": DataAccessFactory()}),
+        }
         expected_tables = []
         return [
-            (ProfilerTransform(config), input_tables, expected_tables, expected_metadata_list),
+            (
+                ProfilerTransform(config),
+                input_tables,
+                expected_tables,
+                expected_metadata_list,
+            ),
         ]
diff --git a/transforms/universal/profiler/python/test/test_profiler_python.py b/transforms/universal/profiler/python/test/test_profiler_python.py
index 2fb8df22f..aa622bc24 100644
--- a/transforms/universal/profiler/python/test/test_profiler_python.py
+++ b/transforms/universal/profiler/python/test/test_profiler_python.py
@@ -11,13 +11,13 @@
 ################################################################################
 import os
 
-from data_processing.test_support import get_files_in_folder
 from data_processing.runtime.pure_python import PythonTransformLauncher
+from data_processing.test_support import get_files_in_folder
 from data_processing.test_support.launch.transform_test import (
     AbstractTransformLauncherTest,
 )
-from profiler_transform_python import ProfilerPythonTransformRuntimeConfiguration
 from profiler_transform_base import doc_column_name_cli_param
+from profiler_transform_python import ProfilerPythonTransformRuntimeConfiguration
 
 
 class TestPythonProfilerTransform(AbstractTransformLauncherTest):
@@ -49,4 +49,3 @@ def _validate_directory_contents_match(self, produced: str, expected: str, ignor
         f_set1 = get_files_in_folder(dir=produced, ext=".json", return_data=False)
         f_set2 = get_files_in_folder(dir=expected, ext=".json", return_data=False)
         assert len(f_set1) == len(f_set2)
-
diff --git a/transforms/universal/profiler/ray/src/profiler_transform_ray.py b/transforms/universal/profiler/ray/src/profiler_transform_ray.py
index 98944fae9..01175bacb 100644
--- a/transforms/universal/profiler/ray/src/profiler_transform_ray.py
+++ b/transforms/universal/profiler/ray/src/profiler_transform_ray.py
@@ -15,7 +15,11 @@
 
 import ray
 from data_processing.data_access import DataAccessFactoryBase
-from data_processing.utils import CLIArgumentProvider, TransformUtils, UnrecoverableException
+from data_processing.utils import (
+    CLIArgumentProvider,
+    TransformUtils,
+    UnrecoverableException,
+)
 from data_processing_ray.runtime.ray import (
     DefaultRayTransformRuntime,
     RayTransformLauncher,
@@ -24,13 +28,13 @@
 from data_processing_ray.runtime.ray.runtime_configuration import (
     RayTransformRuntimeConfiguration,
 )
-from ray.actor import ActorHandle
 from profiler_transform_base import (
     DataAggregator,
     ProfilerTransformBase,
-    ProfilerTransformConfigurationBase,
-    cli_prefix
+    ProfilerTransformConfigurationBase,
+    cli_prefix,
 )
+from ray.actor import ActorHandle
 
 
 class ProfilerTransform(ProfilerTransformBase):
@@ -95,7 +99,10 @@ def __init__(self, params: dict[str, Any]):
         self.logger = get_logger(__name__)
 
     def get_transform_config(
-        self, data_access_factory: DataAccessFactoryBase, statistics: ActorHandle, files: list[str]
+        self,
+        data_access_factory: DataAccessFactoryBase,
+        statistics: ActorHandle,
+        files: list[str],
     ) -> dict[str, Any]:
         """
         Set environment for transform execution
@@ -148,7 +155,10 @@ def compute_execution_stats(self, stats: dict[str, Any]) -> dict[str, Any]:
             remote_replies = not_ready
         if retries > 0:
             stats["data access retries"] = stats.get("data access retries", 0) + retries
-        return {"unique words": sum_aggregators, "words memory, GB": sum_aggregator_mem} | stats
+        return {
+            "unique words": sum_aggregators,
+            "words memory, GB": sum_aggregator_mem,
+        } | stats
 
 
 class ProfilerRayTransformConfiguration(ProfilerTransformConfigurationBase):
@@ -172,13 +182,13 @@ def add_input_params(self, parser: ArgumentParser) -> None:
             f"--{cli_prefix}aggregator_cpu",
             type=float,
             default=0.5,
-            help="number of CPUs per aggregator"
+            help="number of CPUs per aggregator",
         )
         parser.add_argument(
             f"--{cli_prefix}num_aggregators",
             type=int,
             default=0,
-            help="number of aggregator actors to use"
+            help="number of aggregator actors to use",
         )
 
     def apply_input_params(self, args: Namespace) -> bool:
@@ -201,7 +211,10 @@ def apply_input_params(self, args: Namespace) -> bool:
 
 class ProfilerRayTransformRuntimeConfiguration(RayTransformRuntimeConfiguration):
     def __init__(self):
-        super().__init__(transform_config=ProfilerRayTransformConfiguration(), runtime_class=ProfilerRuntime)
+        super().__init__(
+            transform_config=ProfilerRayTransformConfiguration(),
+            runtime_class=ProfilerRuntime,
+        )
 
 
 if __name__ == "__main__":
diff --git a/transforms/universal/profiler/ray/test/test_profiler_ray.py b/transforms/universal/profiler/ray/test/test_profiler_ray.py
index 04c75a6c9..963a93d8b 100644
--- a/transforms/universal/profiler/ray/test/test_profiler_ray.py
+++ b/transforms/universal/profiler/ray/test/test_profiler_ray.py
@@ -54,8 +54,7 @@ def _validate_directory_contents_match(self, produced: str, expected: str, ignor
             expected_len += os.path.getsize(f_set2[i])
         assert abs(produced_len - expected_len) < 500
 
-
-        # Compare metadata
+        # Compare metadata
         f_set1 = get_files_in_folder(dir=produced, ext=".json", return_data=False)
         f_set2 = get_files_in_folder(dir=expected, ext=".json", return_data=False)
         assert len(f_set1) == len(f_set2)
diff --git a/transforms/universal/profiler/spark/src/profiler_local_spark.py b/transforms/universal/profiler/spark/src/profiler_local_spark.py
index dad113a60..0d8b7735e 100644
--- a/transforms/universal/profiler/spark/src/profiler_local_spark.py
+++ b/transforms/universal/profiler/spark/src/profiler_local_spark.py
@@ -15,8 +15,8 @@
 
 from data_processing.utils import ParamsUtils
 from data_processing_spark.runtime.spark import SparkTransformLauncher
-from profiler_transform_spark import ProfilerSparkTransformRuntimeConfiguration
 from profiler_transform_base import doc_column_name_cli_param
+from profiler_transform_spark import ProfilerSparkTransformRuntimeConfiguration
 
 
 # create parameters
diff --git a/transforms/universal/profiler/spark/src/profiler_transform_spark.py b/transforms/universal/profiler/spark/src/profiler_transform_spark.py
index 2466c1785..30884d84e 100644
--- a/transforms/universal/profiler/spark/src/profiler_transform_spark.py
+++ b/transforms/universal/profiler/spark/src/profiler_transform_spark.py
@@ -13,11 +13,19 @@
 from typing import Any
 
 from data_processing.data_access import DataAccessFactoryBase
-from data_processing.utils import UnrecoverableException
 from data_processing.transform import TransformStatistics
-from data_processing_spark.runtime.spark import SparkTransformLauncher
-from data_processing_spark.runtime.spark import SparkTransformRuntimeConfiguration, DefaultSparkTransformRuntime
-from profiler_transform_base import ProfilerTransformBase, ProfilerTransformConfigurationBase, DataAggregator
+from data_processing.utils import UnrecoverableException
+from data_processing_spark.runtime.spark import (
+    DefaultSparkTransformRuntime,
+    SparkTransformLauncher,
+    SparkTransformRuntimeConfiguration,
+)
+from profiler_transform_base import (
+    DataAggregator,
+    ProfilerTransformBase,
+    ProfilerTransformConfigurationBase,
+)
+
 
 class ProfilerSparkTransform(ProfilerTransformBase):
     """
@@ -62,12 +70,16 @@ def __init__(self, params: dict[str, Any]):
             num_aggregators - number of aggregators
         """
         from data_processing.utils import get_logger
+
         super().__init__(params=params)
         self.aggregator = None
         self.logger = get_logger(__name__)
 
     def get_transform_config(
-        self, partition: int, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics
+        self,
+        partition: int,
+        data_access_factory: DataAccessFactoryBase,
+        statistics: TransformStatistics,
     ) -> dict[str, Any]:
         """
         Get the dictionary of configuration that will be provided to the transform's initializer.
@@ -82,7 +94,6 @@ def get_transform_config(
         self.aggregator = DataAggregator({"data_access_factory": data_access_factory})
         return self.params | {"aggregator": self.aggregator}
 
-
     def compute_execution_stats(self, stats: TransformStatistics) -> None:
         """
         Update/augment the given statistics object with runtime-specific additions/modifications.
@@ -99,8 +110,10 @@ def compute_execution_stats(self, stats: TransformStatistics) -> None:
 
 class ProfilerSparkTransformRuntimeConfiguration(SparkTransformRuntimeConfiguration):
     def __init__(self):
-        super().__init__(transform_config=ProfilerTransformConfigurationBase(transform_class=ProfilerSparkTransform),
-                         runtime_class=ProfilerRuntime)
+        super().__init__(
+            transform_config=ProfilerTransformConfigurationBase(transform_class=ProfilerSparkTransform),
+            runtime_class=ProfilerRuntime,
+        )
 
 
 if __name__ == "__main__":
diff --git a/transforms/universal/profiler/spark/test/test_profiler_spark.py b/transforms/universal/profiler/spark/test/test_profiler_spark.py
index 99e64a8df..ee7632c3f 100644
--- a/transforms/universal/profiler/spark/test/test_profiler_spark.py
+++ b/transforms/universal/profiler/spark/test/test_profiler_spark.py
@@ -12,12 +12,12 @@
 import os
 
 from data_processing.test_support import get_files_in_folder
-from data_processing_spark.runtime.spark import SparkTransformLauncher
 from data_processing.test_support.launch.transform_test import (
     AbstractTransformLauncherTest,
 )
-from profiler_transform_spark import ProfilerSparkTransformRuntimeConfiguration
+from data_processing_spark.runtime.spark import SparkTransformLauncher
 from profiler_transform_base import doc_column_name_cli_param
+from profiler_transform_spark import ProfilerSparkTransformRuntimeConfiguration
 
 
 class TestPythonProfilerTransform(AbstractTransformLauncherTest):
@@ -44,9 +44,7 @@ def _validate_directory_contents_match(self, produced: str, expected: str, ignor
             expected_len += os.path.getsize(f_set2[i])
         assert abs(produced_len - expected_len) < 500
 
-
-        # Compare metadata
+        # Compare metadata
         f_set1 = get_files_in_folder(dir=produced, ext=".json", return_data=False)
         f_set2 = get_files_in_folder(dir=expected, ext=".json", return_data=False)
         assert len(f_set1) == len(f_set2)
-
diff --git a/transforms/universal/resize/kfp_ray/resize_wf.py b/transforms/universal/resize/kfp_ray/resize_wf.py
index 0724ed731..b6c1ae3ea 100644
--- a/transforms/universal/resize/kfp_ray/resize_wf.py
+++ b/transforms/universal/resize/kfp_ray/resize_wf.py
@@ -27,6 +27,7 @@
 # path to kfp component specifications files
 component_spec_path = "../../../../kfp/kfp_ray_components/"
 
+
 # compute execution parameters. Here different transforms might need different implementations. As
 # a result, instead of creating a component we are creating it in place here.
 def compute_exec_params_func(
@@ -107,7 +108,14 @@ def resize(
     ray_name: str = "resize-kfp-ray",  # name of Ray cluster
     # Add image_pull_secret and image_pull_policy to ray workers if needed
     ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image},
-    ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image},
+    ray_worker_options: dict = {
+        "replicas": 2,
+        "max_replicas": 2,
+        "min_replicas": 2,
+        "cpu": 2,
+        "memory": 4,
+        "image": task_image,
+    },
     server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888",
     # data access
     data_s3_config: str = "{'input_folder': 'test/resize/input/', 'output_folder': 'test/resize/output/'}",
@@ -118,9 +126,13 @@ def resize(
     data_data_sets: str = "",
     data_files_to_use: str = "['.parquet']",
     # orchestrator
-    runtime_actor_options: dict = {'num_cpus': 0.8},
+    runtime_actor_options: dict = {"num_cpus": 0.8},
     runtime_pipeline_id: str = "pipeline_id",
-    runtime_code_location: dict = {'github': 'github', 'commit_hash': '12345', 'path': 'path'},
+    runtime_code_location: dict = {
+        "github": "github",
+        "commit_hash": "12345",
+        "path": "path",
+    },
     # doc id parameters
     resize_max_rows_per_table: int = 20,
     resize_max_mbytes_per_table: int = -1,
@@ -167,7 +179,12 @@ def resize(
     :return: None
     """
     # create clean_up task
-    clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params)
+    clean_up_task = cleanup_ray_op(
+        ray_name=ray_name,
+        run_id=run_id,
+        server_url=server_url,
+        additional_params=additional_params,
+    )
     ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2)
     # pipeline definition
     with dsl.ExitHandler(clean_up_task):
diff --git a/transforms/universal/resize/python/src/resize_transform.py b/transforms/universal/resize/python/src/resize_transform.py
index 90ec8308c..2a6fe8d56 100644
--- a/transforms/universal/resize/python/src/resize_transform.py
+++ b/transforms/universal/resize/python/src/resize_transform.py
@@ -134,7 +134,6 @@ def flush(self) -> tuple[list[pa.Table], dict[str, Any]]:
 
 
 class ResizeTransformConfiguration(TransformConfiguration):
-
     """
     Provides support for configuring and using the associated Transform class
     include configuration with CLI args and combining of metadata.
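The resize and tokenization test hunks that follow keep re-wrapping the two fixture shapes used throughout this patch's tests: 4-tuples of either (transform, input_tables, expected_tables, expected_metadata_list) for in-memory table tests, or (launcher, config, input_dir, expected_dir) for launcher tests. A minimal sketch of those shapes is below; the helper names are hypothetical and only the tuple layouts are taken from the diffs, not the actual data_processing test-support API.

```python
import os
from typing import Any

import pyarrow as pa

# Tiny stand-in table, mirroring the pa.Table fixtures seen in these tests.
table = pa.Table.from_pydict({"contents": pa.array(["doc one", "doc two"])})


def table_fixture(transform: Any, metadata: list[dict]) -> tuple:
    # in-memory fixture: tables in, expected tables out, plus expected metadata
    return (transform, [table], [table], metadata)


def launcher_fixture(launcher: Any, config: dict, basedir: str) -> tuple:
    # launcher fixture: run end-to-end over folders of input/expected files
    return (
        launcher,
        config,
        os.path.join(basedir, "input"),
        os.path.join(basedir, "expected"),
    )
```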
diff --git a/transforms/universal/resize/python/test/test_resize.py b/transforms/universal/resize/python/test/test_resize.py
index 1d63a34c1..8a99e79ce 100644
--- a/transforms/universal/resize/python/test/test_resize.py
+++ b/transforms/universal/resize/python/test/test_resize.py
@@ -34,28 +34,70 @@ def get_test_transform_fixtures(self) -> list[Tuple]:
 
         config = {"max_rows_per_table": 300}
         expected_tables = get_tables_in_folder(os.path.join(basedir, "expected-rows-300"))
-        fixtures.append((ResizeTransform(config), input_tables, expected_tables, expected_metadata_list))
+        fixtures.append(
+            (
+                ResizeTransform(config),
+                input_tables,
+                expected_tables,
+                expected_metadata_list,
+            )
+        )
 
         config = {"max_rows_per_table": 125}
         expected_tables = get_tables_in_folder(os.path.join(basedir, "expected-rows-125"))
-        fixtures.append((ResizeTransform(config), input_tables, expected_tables, expected_metadata_list))
+        fixtures.append(
+            (
+                ResizeTransform(config),
+                input_tables,
+                expected_tables,
+                expected_metadata_list,
+            )
+        )
 
         config = {"max_mbytes_per_table": 0.05}
         expected_tables = get_tables_in_folder(os.path.join(basedir, "expected-mbytes-0.05"))
-        fixtures.append((ResizeTransform(config), input_tables, expected_tables, expected_metadata_list))
+        fixtures.append(
+            (
+                ResizeTransform(config),
+                input_tables,
+                expected_tables,
+                expected_metadata_list,
+            )
+        )
 
         config = {"max_mbytes_per_table": 1}
         expected_tables = get_tables_in_folder(os.path.join(basedir, "expected-mbytes-1"))
-        fixtures.append((ResizeTransform(config), input_tables, expected_tables, expected_metadata_list))
+        fixtures.append(
+            (
+                ResizeTransform(config),
+                input_tables,
+                expected_tables,
+                expected_metadata_list,
+            )
+        )
 
         #
         # Merge the 1st 2 and some of the 2nd with the 3rd
         config = {"max_mbytes_per_table": 0.05}
         expected_tables = get_tables_in_folder(os.path.join(basedir, "expected-mbytes-0.05"))
-        fixtures.append((ResizeTransform(config), input_tables, expected_tables, expected_metadata_list))
+        fixtures.append(
+            (
+                ResizeTransform(config),
+                input_tables,
+                expected_tables,
+                expected_metadata_list,
+            )
+        )
 
         # Split into 4 or so files
         config = {"max_mbytes_per_table": 0.02}
         expected_tables = get_tables_in_folder(os.path.join(basedir, "expected-mbytes-0.02"))
-        fixtures.append((ResizeTransform(config), input_tables, expected_tables, expected_metadata_list))
+        fixtures.append(
+            (
+                ResizeTransform(config),
+                input_tables,
+                expected_tables,
+                expected_metadata_list,
+            )
+        )
 
         return fixtures
diff --git a/transforms/universal/resize/ray/test/test_resize_launch.py b/transforms/universal/resize/ray/test/test_resize_launch.py
index 5d3ff01d9..f13d72136 100644
--- a/transforms/universal/resize/ray/test/test_resize_launch.py
+++ b/transforms/universal/resize/ray/test/test_resize_launch.py
@@ -27,7 +27,10 @@ class TestRayResizeTransform(AbstractTransformLauncherTest):
 
     def get_test_transform_fixtures(self) -> list[tuple]:
         # The following based on 3 identical input files of about 39kbytes, and 200 rows
-        common_config = {"runtime_num_workers": 1, "run_locally": True}  # to make the output files repeatable.
+        common_config = {
+            "runtime_num_workers": 1,
+            "run_locally": True,
+        }  # to make the output files repeatable.
         fixtures = []
         basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data"))
         launcher = RayTransformLauncher(ResizeRayTransformConfiguration())
diff --git a/transforms/universal/resize/spark/src/resize_transform_spark.py b/transforms/universal/resize/spark/src/resize_transform_spark.py
index 17f2804ab..b6aeae0b9 100644
--- a/transforms/universal/resize/spark/src/resize_transform_spark.py
+++ b/transforms/universal/resize/spark/src/resize_transform_spark.py
@@ -10,7 +10,10 @@
 # limitations under the License.
 ################################################################################
 from data_processing.utils import get_logger
-from data_processing_spark.runtime.spark import SparkTransformLauncher, SparkTransformRuntimeConfiguration
+from data_processing_spark.runtime.spark import (
+    SparkTransformLauncher,
+    SparkTransformRuntimeConfiguration,
+)
 from resize_transform import ResizeTransformConfiguration
diff --git a/transforms/universal/tokenization/kfp_ray/tokenization_wf.py b/transforms/universal/tokenization/kfp_ray/tokenization_wf.py
index c9fb6f2e9..cb04157db 100644
--- a/transforms/universal/tokenization/kfp_ray/tokenization_wf.py
+++ b/transforms/universal/tokenization/kfp_ray/tokenization_wf.py
@@ -129,7 +129,11 @@ def tokenization(
     # orchestrator
     runtime_actor_options: dict = {"num_cpus": 0.8},
     runtime_pipeline_id: str = "pipeline_id",
-    runtime_code_location: dict = {"github": "github", "commit_hash": "12345", "path": "path"},
+    runtime_code_location: dict = {
+        "github": "github",
+        "commit_hash": "12345",
+        "path": "path",
+    },
     # tokenizer parameters
     tkn_tokenizer: str = "hf-internal-testing/llama-tokenizer",
     tkn_doc_id_column: str = "document_id",
@@ -183,7 +187,10 @@ def tokenization(
     """
     # create clean_up task
     clean_up_task = cleanup_ray_op(
-        ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params
+        ray_name=ray_name,
+        run_id=run_id,
+        server_url=server_url,
+        additional_params=additional_params,
     )
     ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2)
     # pipeline definition
diff --git a/transforms/universal/tokenization/test/test_tokenization.py b/transforms/universal/tokenization/test/test_tokenization.py
index 3cb53a047..bfee5f6ed 100644
--- a/transforms/universal/tokenization/test/test_tokenization.py
+++ b/transforms/universal/tokenization/test/test_tokenization.py
@@ -41,7 +41,12 @@
 """
 expected output table as per HF's `bigcode/starcoder` tokenizer:
 """
-tokens = pa.array([[1, 910, 2793, 338, 363, 1574, 29900, 29896], [1, 7280, 1842, 2793, 338, 363, 1574, 29900, 29941]])
+tokens = pa.array(
+    [
+        [1, 910, 2793, 338, 363, 1574, 29900, 29896],
+        [1, 7280, 1842, 2793, 338, 363, 1574, 29900, 29941],
+    ]
+)
 document_id = pa.array(["doc01", "doc03"])
 document_length = pa.array([25, 37])
 token_count = pa.array([8, 9])
@@ -62,7 +67,14 @@
 expected output metadata:
 """
 expected_metadata_list = [
-    {"num_files": 1, "num_rows": 3, "num_tokenized_rows": 2, "num_empty_rows": 1, "num_tokens": 17, "num_chars": 62},
+    {
+        "num_files": 1,
+        "num_rows": 3,
+        "num_tokenized_rows": 2,
+        "num_empty_rows": 1,
+        "num_tokens": 17,
+        "num_chars": 62,
+    },
     {},
 ]
 
@@ -88,6 +100,11 @@ class TestTokenizationTransform(AbstractTableTransformTest):
 
     def get_test_transform_fixtures(self) -> list[Tuple]:
         fixtures = [
-            (TokenizationTransform(config), [table], [expected_table], expected_metadata_list),
+            (
+                TokenizationTransform(config),
+                [table],
+                [expected_table],
+                expected_metadata_list,
+            ),
         ]
         return fixtures
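The tokenization test above declares its expected columns as separate pyarrow arrays. A hedged sketch of how they could combine into the `expected_table` the fixture references; the column names are an assumption, since the actual table construction sits outside the visible hunks.

```python
# Assumed assembly of expected_table from the arrays in the hunk above;
# the column names are illustrative, not taken from the patch.
import pyarrow as pa

tokens = pa.array(
    [
        [1, 910, 2793, 338, 363, 1574, 29900, 29896],
        [1, 7280, 1842, 2793, 338, 363, 1574, 29900, 29941],
    ]
)
expected_table = pa.table(
    {
        "tokens": tokens,
        "document_id": pa.array(["doc01", "doc03"]),
        "document_length": pa.array([25, 37]),
        "token_count": pa.array([8, 9]),
    }
)
```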
diff --git a/transforms/universal/web2parquet/dpk_web2parquet/config.py b/transforms/universal/web2parquet/dpk_web2parquet/config.py
index 16584cb57..211285f66 100644
--- a/transforms/universal/web2parquet/dpk_web2parquet/config.py
+++ b/transforms/universal/web2parquet/dpk_web2parquet/config.py
@@ -13,10 +13,10 @@
 from argparse import ArgumentParser, Namespace
 
 from data_processing.transform import TransformConfiguration
-from data_processing.utils import CLIArgumentProvider
-from data_processing.utils import get_logger
+from data_processing.utils import CLIArgumentProvider, get_logger
 from dpk_web2parquet.transform import Web2ParquetTransform
 
+
 short_name = "web2parquet"
 cli_prefix = f"{short_name}_"
 urls_cli_param = f"{cli_prefix}urls"
@@ -25,20 +25,17 @@
 folder_cli_param = f"{cli_prefix}folder"
 
 
-logger = get_logger(__name__,"DEBUG")
-
-class Web2ParquetTransformConfiguration(TransformConfiguration):
+logger = get_logger(__name__, "DEBUG")
+
+class Web2ParquetTransformConfiguration(TransformConfiguration):
     """
     Provides support for configuring and using the associated Transform class include configuration with CLI args.
     """
 
     def __init__(self):
-        super().__init__(
-            name=short_name,
-            transform_class=Web2ParquetTransform
-        )
+        super().__init__(name=short_name, transform_class=Web2ParquetTransform)
 
     def add_input_params(self, parser: ArgumentParser) -> None:
         """
@@ -47,16 +44,28 @@ def add_input_params(self, parser: ArgumentParser) -> None:
         By convention a common prefix should be used for all transform-specific CLI args
         (e.g, noop_, pii_, etc.)
         """
-        parser.add_argument(f"--{depth_cli_param}", type=int, default=1,
+        parser.add_argument(
+            f"--{depth_cli_param}",
+            type=int,
+            default=1,
             help="maxumum depth relative to seed URL",
         )
-        parser.add_argument(f"--{downloads_cli_param}", type=int, default=1,
+        parser.add_argument(
+            f"--{downloads_cli_param}",
+            type=int,
+            default=1,
             help="maxumum number of downloaded URLs",
        )
-        parser.add_argument(f"--{folder_cli_param}", type=str, default=None,
+        parser.add_argument(
+            f"--{folder_cli_param}",
+            type=str,
+            default=None,
             help="Folder where to store downloaded files",
         )
-        parser.add_argument(f"--{urls_cli_param}", type=str, default=None,
+        parser.add_argument(
+            f"--{urls_cli_param}",
+            type=str,
+            default=None,
             help="List of Seed URLs for the crawler",
         )
 
@@ -74,8 +83,3 @@ def apply_input_params(self, args: Namespace) -> bool:
         self.params = self.params | captured
         logger.info(f"web2parquet parameters are : {self.params}")
         return True
-
-
-
-
-
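The four parameters registered above surface with the `web2parquet_` prefix. A sketch of how a caller would pass them to the Python launcher, following the pattern `local_python.py` uses in the next hunks; the folder value here is a made-up example.

```python
# Hedged sketch: driving the launcher with the CLI parameters defined above.
# ParamsUtils.dict_to_req mirrors the local_python.py pattern in this diff.
import sys

from data_processing.utils import ParamsUtils

params = {
    "web2parquet_urls": "https://thealliance.ai/",
    "web2parquet_depth": 1,
    "web2parquet_downloads": 1,
    "web2parquet_folder": "downloads",  # optional: also save crawled files locally
}
# Convert the dict into --key value pairs on the simulated command line.
sys.argv = ParamsUtils.dict_to_req(d=params)
```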
diff --git a/transforms/universal/web2parquet/dpk_web2parquet/local.py b/transforms/universal/web2parquet/dpk_web2parquet/local.py
index cc0b8956d..c2b628909 100644
--- a/transforms/universal/web2parquet/dpk_web2parquet/local.py
+++ b/transforms/universal/web2parquet/dpk_web2parquet/local.py
@@ -13,14 +13,13 @@
 
 from dpk_web2parquet.transform import Web2Parquet
 
+
 # create parameters
 if __name__ == "__main__":
     # Here we show how to run outside of the runtime
     # Create and configure the transform.
-    transform = Web2Parquet(urls= ['https://thealliance.ai/'],
-                            depth=1,
-                            downloads=1)
+    transform = Web2Parquet(urls=["https://thealliance.ai/"], depth=1, downloads=1)
     table_list, metadata = transform.transform()
-    #print(f"\noutput table: {table_list}")
-    print(f"output metadata : {metadata}")
\ No newline at end of file
+    # print(f"\noutput table: {table_list}")
+    print(f"output metadata : {metadata}")
diff --git a/transforms/universal/web2parquet/dpk_web2parquet/local_python.py b/transforms/universal/web2parquet/dpk_web2parquet/local_python.py
index 735f0eb02..2f2d75b6a 100644
--- a/transforms/universal/web2parquet/dpk_web2parquet/local_python.py
+++ b/transforms/universal/web2parquet/dpk_web2parquet/local_python.py
@@ -1,4 +1,4 @@
-#(C) Copyright IBM Corp. 2024.
+# (C) Copyright IBM Corp. 2024.
 # Licensed under the Apache License, Version 2.0 (the “License”);
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -19,7 +19,7 @@
 
 
 # create parameters
-input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..","test-data","input"))
+input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input"))
 output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "output"))
 local_conf = {
     "input_folder": input_folder,
@@ -34,7 +34,7 @@
     "runtime_job_id": "job_id",
     "runtime_code_location": ParamsUtils.convert_to_ast(code_location),
     # web2parquet params
-    "web2parquet_urls": 'https://thealliance.ai/',
+    "web2parquet_urls": "https://thealliance.ai/",
     "web2parquet_depth": 1,
     "web2parquet_downloads": 1,
 }
diff --git a/transforms/universal/web2parquet/dpk_web2parquet/python_runtime.py b/transforms/universal/web2parquet/dpk_web2parquet/python_runtime.py
index 6b2acdfc5..158aa740d 100644
--- a/transforms/universal/web2parquet/dpk_web2parquet/python_runtime.py
+++ b/transforms/universal/web2parquet/dpk_web2parquet/python_runtime.py
@@ -41,4 +41,4 @@ def __init__(self):
 if __name__ == "__main__":
     launcher = PythonTransformLauncher(Web2ParquetPythonTransformConfiguration())
     logger.info("Launching web2parquet transform")
-    launcher.launch()
\ No newline at end of file
+    launcher.launch()
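Taken together, the files above give two entry points: the launcher-driven run in `local_python.py` and the direct, runtime-free call in `local.py`. The direct form, restated from the patch for readers skimming past the diff markers:

```python
# Direct usage exactly as in local.py above: crawl one seed URL to depth 1,
# keep at most one download, and get back a pyarrow table plus metadata.
from dpk_web2parquet.transform import Web2Parquet

transform = Web2Parquet(urls=["https://thealliance.ai/"], depth=1, downloads=1)
table_list, metadata = transform.transform()
print(f"output metadata : {metadata}")
```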
diff --git a/transforms/universal/web2parquet/dpk_web2parquet/transform.py b/transforms/universal/web2parquet/dpk_web2parquet/transform.py
index 5cd402fa2..fe28ad962 100644
--- a/transforms/universal/web2parquet/dpk_web2parquet/transform.py
+++ b/transforms/universal/web2parquet/dpk_web2parquet/transform.py
@@ -21,21 +21,20 @@
 
 from dpk_web2parquet.utils import *
 
-
 user_agent = "Mozilla/5.0 (X11; Linux i686; rv:125.0) Gecko/20100101 Firefox/125.0"
 
-logger = get_logger(__name__,"DEBUG")
+logger = get_logger(__name__, "DEBUG")
+
 
 class Web2ParquetTransform(AbstractTableTransform):
     """
     Crawl the web and load content to pyarrow Table.
     """
 
-
     def __init__(self, config: dict[str, Any]):
         """
         Initialize based on the dictionary of configuration information.
-        example: 
+        example:
             kwargs = {'urls': ['https://thealliance.ai/'],'depth': 1,'downloads': 1}
             Web2ParquetTransform(**kwargs)
             or
@@ -48,13 +47,16 @@ def __init__(self, config: dict[str, Any]):
         self.seed_urls = config.get("urls", [])
         self.depth = config.get("depth", 1)
         self.downloads = config.get("downloads", 10)
-        self.allow_mime_types = config.get("mime_types", ["application/pdf","text/html","text/markdown","text/plain"])
-        self.folder=config.get('folder', None)
+        self.allow_mime_types = config.get(
+            "mime_types",
+            ["application/pdf", "text/html", "text/markdown", "text/plain"],
+        )
+        self.folder = config.get("folder", None)
         assert self.seed_urls, "Must specify a list of URLs to crawl. Url cannot be None"
 
         ## users may be tempted to provide a single URLs, we still need to put it in a list of 1
         if type(self.seed_urls) is not list:
-            self.seed_urls=[self.seed_urls]
+            self.seed_urls = [self.seed_urls]
 
         self.count = 0
         self.docs = []
@@ -64,18 +66,17 @@ def on_download(self, url: str, body: bytes, headers: dict) -> None:
         Callback function called when a page has been downloaded.
         You have access to the request URL, response body and headers.
         """
-        doc=get_file_info(url, headers)
-        doc['url'] = url
-        doc['contents'] = body
-
+        doc = get_file_info(url, headers)
+        doc["url"] = url
+        doc["contents"] = body
+
         logger.debug(f"url: {doc['url']}, filename: {doc['filename']}, content_type: {doc['content_type']}")
 
         ## Enforce download limits
         if len(self.docs) < self.downloads:
             self.docs.append(doc)
 
-
-    def transform(self, table: pa.Table=None, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]:
+    def transform(self, table: pa.Table = None, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]:
         """
         Put Transform-specific to convert one Table to 0 or more tables. It also returns
         a dictionary of execution statistics - arbitrary dictionary
@@ -89,31 +90,28 @@ def transform(self, table: pa.Table=None, file_name: str = None) -> tuple[list[p
             user_agent=user_agent,
             depth_limit=self.depth,
             download_limit=self.downloads,
-            allow_mime_types=self.allow_mime_types
+            allow_mime_types=self.allow_mime_types,
         )  # blocking call
-
-        end_time = time.time()
+        end_time = time.time()
         table = pa.Table.from_pylist(self.docs)
         metadata = {
             "count": len(self.docs),
             "requested_seeds": len(self.seed_urls),
             "requested_depth": self.depth,
-            "requested_downloads": self.downloads,
-            }
-        logger.info(f"Crawling is completed in {end_time - start_time:.2f} seconds")
+            "requested_downloads": self.downloads,
+        }
+        logger.info(f"Crawling is completed in {end_time - start_time:.2f} seconds")
         logger.info(f"{metadata = }")
 
         #############################################################################
         ## The same transform can also be used to store crawled files to local folder
         if self.folder:
-            dao=DataAccessLocal(local_config={'output_folder':self.folder,'input_folder':'.'})
+            dao = DataAccessLocal(local_config={"output_folder": self.folder, "input_folder": "."})
             for x in self.docs:
-                dao.save_file(self.folder+'/'+x['filename'], x['contents'])
-
+                dao.save_file(self.folder + "/" + x["filename"], x["contents"])
+
         return [table], metadata
-
-
 
 class Web2Parquet(Web2ParquetTransform):
@@ -123,4 +121,3 @@ class Web2Parquet(Web2ParquetTransform):
 
     def __init__(self, **kwargs):
         super().__init__(dict(kwargs))
-
diff --git a/transforms/universal/web2parquet/dpk_web2parquet/utils.py b/transforms/universal/web2parquet/dpk_web2parquet/utils.py
index 8214cc817..7ffada03c 100644
--- a/transforms/universal/web2parquet/dpk_web2parquet/utils.py
+++ b/transforms/universal/web2parquet/dpk_web2parquet/utils.py
@@ -13,26 +13,25 @@
 
 from urllib.parse import urlparse
 
-def get_file_info(url: str, headers: dict=None):
+
+def get_file_info(url: str, headers: dict = None):
     try:
-        file_size = int(headers['Content-Length'])
+        file_size = int(headers["Content-Length"])
     except:
-        file_size=0
+        file_size = 0
     try:
-        content_type=headers.get('Content-Type')
+        content_type = headers.get("Content-Type")
     except:
-        content_type='text/html'
-
-    url_parse=urlparse(url)
+        content_type = "text/html"
+
+    url_parse = urlparse(url)
     try:
-        filename = headers.get('Content-Disposition').split('filename=')[1].strip().strip('"')
+        filename = headers.get("Content-Disposition").split("filename=")[1].strip().strip('"')
     except:
-        filename='-'.join(url_parse.path.strip('/').split('/'))
-        # Prepend host name
-        filename=url_parse.netloc.replace('.',"_")+'_'+filename
-
-    # append extension using content type
-    filename = filename+"_"+content_type.split(';')[0].replace("/", ".")
-    return {'filename':filename, 'content_type': content_type, 'file_size': file_size}
-
+        filename = "-".join(url_parse.path.strip("/").split("/"))
+        # Prepend host name
+        filename = url_parse.netloc.replace(".", "_") + "_" + filename
+    # append extension using content type
+    filename = filename + "_" + content_type.split(";")[0].replace("/", ".")
+    return {"filename": filename, "content_type": content_type, "file_size": file_size}
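When the server sends no `Content-Disposition`, `get_file_info` derives a filename from the URL path, prefixed with the underscored host and suffixed with the MIME type. A worked trace through the patched code, using a hypothetical URL and header set:

```python
# Tracing get_file_info as patched above (the URL and headers are examples).
from dpk_web2parquet.utils import get_file_info

info = get_file_info(
    "https://thealliance.ai/our-work",
    {"Content-Type": "text/html", "Content-Length": "1024"},
)
# No Content-Disposition header, so the except branch runs:
#   path "our-work" -> prefixed with "thealliance_ai" -> suffixed with "text.html"
# info == {"filename": "thealliance_ai_our-work_text.html",
#          "content_type": "text/html", "file_size": 1024}
```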
diff --git a/transforms/universal/web2parquet/test/test_web2parquet.py b/transforms/universal/web2parquet/test/test_web2parquet.py
index da99d168d..a416e4e4f 100644
--- a/transforms/universal/web2parquet/test/test_web2parquet.py
+++ b/transforms/universal/web2parquet/test/test_web2parquet.py
@@ -32,16 +32,18 @@ def get_test_transform_fixtures(self) -> list[tuple]:
         launcher = PythonTransformLauncher(Web2ParquetPythonTransformConfiguration())
         input_dir = os.path.join(src_file_dir, "../test-data/input")
         expected_dir = os.path.join(src_file_dir, "../test-data/expected")
-        transform_config = {"web2parquet_urls": 'https://thealliance.ai/',
+        transform_config = {
+            "web2parquet_urls": "https://thealliance.ai/",
             "web2parquet_depth": 1,
-            "web2parquet_downloads": 1}
+            "web2parquet_downloads": 1,
+        }
         fixtures.append(
             (
                 launcher,
                 transform_config,
                 input_dir,
                 expected_dir,
-                ['contents'],  # optional list of column names to ignore in comparing test-generated with expected.
+                ["contents"],  # optional list of column names to ignore in comparing test-generated with expected.
             )
         )
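The `["contents"]` entry marks columns to skip when comparing produced output against the expected directory, since crawled page bodies vary from run to run. A sketch of the idea; this helper is an assumption, not the test framework's actual implementation:

```python
# Hedged sketch of an ignore-aware table comparison using pyarrow.
import pyarrow as pa


def tables_equal_ignoring(a: pa.Table, b: pa.Table, ignore: list[str]) -> bool:
    # Drop the ignored columns from both tables, then compare what remains.
    keep_a = [name for name in a.column_names if name not in ignore]
    keep_b = [name for name in b.column_names if name not in ignore]
    return a.select(keep_a).equals(b.select(keep_b))
```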