PpCalculation: Make parsing of output files optional (aiidateam#1029)

The `parse_data_files` option is added. When set to `False`, the parser will not parse the output data files but just keep the raw files. The existing option `keep_plot_file` is deprecated in favor of the renamed `keep_data_files` option, to make it consistent with the new option.
bastonero · Jan 6, 2025 · 70b38f6 · 70b38f6
1 parent 2c564e2
commit 70b38f6
Show file tree

Hide file tree

Showing 4 changed files with 128 additions and 47 deletions.
diff --git a/src/aiida_quantumespresso/calculations/pp.py b/src/aiida_quantumespresso/calculations/pp.py
@@ -1,9 +1,11 @@
 # -*- coding: utf-8 -*-
 """`CalcJob` implementation for the pp.x code of Quantum ESPRESSO."""
 import os
+import warnings
 
 from aiida import orm
 from aiida.common import datastructures, exceptions
+from aiida.common.warnings import AiidaDeprecationWarning
 
 from aiida_quantumespresso.calculations import _lowercase_dict, _uppercase_dict
 from aiida_quantumespresso.utils.convert import convert_input_to_namelist_entry
@@ -82,7 +84,9 @@ def define(cls, spec):
         spec.input('metadata.options.output_filename', valid_type=str, default=cls._DEFAULT_OUTPUT_FILE)
         spec.input('metadata.options.parser_name', valid_type=str, default='quantumespresso.pp')
         spec.input('metadata.options.withmpi', valid_type=bool, default=True)
-        spec.input('metadata.options.keep_plot_file', valid_type=bool, default=False)
+        spec.input('metadata.options.keep_plot_file', valid_type=bool, required=False)
+        spec.input('metadata.options.keep_data_files', valid_type=bool, default=False)
+        spec.input('metadata.options.parse_data_files', valid_type=bool, default=True)
 
         spec.output('output_parameters', valid_type=orm.Dict)
         spec.output('output_data', valid_type=orm.ArrayData)
@@ -218,10 +222,16 @@ def prepare_for_submission(self, folder):  # pylint: disable=too-many-branches,t
         # distinguish them from one another. The `fileout` filename will be the full data filename with the `fileout`
         # value as a suffix.
         retrieve_tuples = [self._FILEOUT, (f'{self._FILPLOT}_*{self._FILEOUT}', '.', 0)]
-
-        if self.inputs.metadata.options.keep_plot_file:
+        if 'keep_plot_file' in self.inputs.metadata.options:
+            self.inputs.metadata.options.keep_data_files = self.inputs.metadata.options.keep_plot_file
+            warnings.warn(
+                "The input parameter 'keep_plot_file' is deprecated and will be removed in version 5.0.0. "
+                "Please use 'keep_data_files' instead.", AiidaDeprecationWarning
+            )
+        if self.inputs.metadata.options.keep_data_files:
             calcinfo.retrieve_list.extend(retrieve_tuples)
-        else:
+        # If we do not want to parse the retrieved files, temporary retrieval is meaningless
+        elif self.inputs.metadata.options.parse_data_files:
             calcinfo.retrieve_temporary_list.extend(retrieve_tuples)
 
         return calcinfo
diff --git a/src/aiida_quantumespresso/parsers/pp.py b/src/aiida_quantumespresso/parsers/pp.py
@@ -117,35 +117,35 @@ def get_key_from_filename(filename):
             matches = re.search(pattern, filename)
             return matches.group(1)
 
-        for filename in filenames:
-            # Directly parse the retrieved files after reading them to memory (`data_raw`). The raw data
-            # of each file is released from memory after parsing, to improve memory usage.
-            if filename.endswith(filename_suffix):
-                # Read the file to memory
-                try:
-                    with file_opener(filename) as handle:
-                        data_raw = handle.read()
-                except OSError:
-                    return self.exit_codes.ERROR_OUTPUT_DATAFILE_READ.format(filename=filename)
-                # Parse the file
-                try:
-                    key = get_key_from_filename(filename)
-                    data_parsed.append((key, parsers[iflag](data_raw, self.units_dict[parsed_data['plot_num']])))
-                    del data_raw
-                except Exception as exception:  # pylint: disable=broad-except
-                    return self.exit_codes.ERROR_OUTPUT_DATAFILE_PARSE.format(filename=filename, exception=exception)
-
-        # If we don't have any parsed files, we exit. Note that this will not catch the case where there should be more
-        # than one file, but the engine did not retrieve all of them. Since often we anyway don't know how many files
-        # should be retrieved there really is no way to check this explicitly.
-        if not data_parsed:
-            return self.exit_codes.ERROR_OUTPUT_DATAFILE_MISSING.format(filename=filename_prefix)
-
-        # Create output nodes
-        if len(data_parsed) == 1:
-            self.out('output_data', data_parsed[0][1])
-        else:
-            self.out('output_data_multiple', dict(data_parsed))
+        if self.node.base.attributes.get('parse_data_files'):
+            for filename in filenames:
+                # Directly parse the retrieved files after reading them to memory (`data_raw`). The raw data
+                # of each file is released from memory after parsing, to improve memory usage.
+                if filename.endswith(filename_suffix):
+                    # Read the file to memory
+                    try:
+                        with file_opener(filename) as handle:
+                            data_raw = handle.read()
+                    except OSError:
+                        return self.exit_codes.ERROR_OUTPUT_DATAFILE_READ.format(filename=filename)
+                    # Parse the file
+                    try:
+                        key = get_key_from_filename(filename)
+                        data_parsed.append((key, parsers[iflag](data_raw, self.units_dict[parsed_data['plot_num']])))
+                        del data_raw
+                    except Exception as exception:  # pylint: disable=broad-except
+                        return self.exit_codes.ERROR_OUTPUT_DATAFILE_PARSE.format(filename=filename, exception=exception)
+
+            # If we don't have any parsed files, we exit. Note that this will not catch the case where there should be more
+            # than one file, but the engine did not retrieve all of them. Since often we anyway don't know how many files
+            # should be retrieved there really is no way to check this explicitly.
+            if not data_parsed:
+                return self.exit_codes.ERROR_OUTPUT_DATAFILE_MISSING.format(filename=filename_prefix)
+
+            if len(data_parsed) == 1:
+                self.out('output_data', data_parsed[0][1])
+            else:
+                self.out('output_data_multiple', dict(data_parsed))
 
         return self.exit(logs=logs)
 

diff --git a/tests/calculations/test_pp.py b/tests/calculations/test_pp.py
@@ -60,11 +60,11 @@ def test_pp_default(fixture_sandbox, generate_calc_job, generate_inputs, file_re
     file_regression.check(input_written, encoding='utf-8', extension='.in')
 
 
-def test_pp_keep_plot_file(fixture_sandbox, generate_calc_job, generate_inputs):
+def test_pp_keep_data_files(fixture_sandbox, generate_calc_job, generate_inputs):
     """Test a `PpCalculation` where we want to retrieve the plot file."""
     entry_point_name = 'quantumespresso.pp'
     inputs = generate_inputs()
-    inputs.metadata.options.keep_plot_file = True
+    inputs.metadata.options.keep_data_files = True
 
     calc_info = generate_calc_job(fixture_sandbox, entry_point_name, inputs)
     retrieve_list = ['aiida.out', 'aiida.fileout', ('aiida.filplot_*aiida.fileout', '.', 0)]
@@ -80,6 +80,26 @@ def test_pp_keep_plot_file(fixture_sandbox, generate_calc_job, generate_inputs):
         assert element in calc_info.retrieve_list
 
 
+def test_pp_parse_data_files(fixture_sandbox, generate_calc_job, generate_inputs):
+    """Test a `PpCalculation` with `parse_data_files=False`, so the data files are not retrieved at all."""
+    entry_point_name = 'quantumespresso.pp'
+    inputs = generate_inputs()
+    inputs.metadata.options.parse_data_files = False
+
+    calc_info = generate_calc_job(fixture_sandbox, entry_point_name, inputs)
+    retrieve_list = ['aiida.out']
+    retrieve_temporary_list = []
+    local_copy_list = []
+
+    # `keep_data_files` defaults to False; with `parse_data_files` also False, the data files are not retrieved.
+    assert isinstance(calc_info, datastructures.CalcInfo)
+    assert sorted(calc_info.local_copy_list) == sorted(local_copy_list)
+    assert sorted(calc_info.retrieve_temporary_list) == sorted(retrieve_temporary_list)
+    assert len(calc_info.retrieve_list) == 1
+    for element in retrieve_list:
+        assert element in calc_info.retrieve_list
+
+
 def test_pp_cmdline_setting(fixture_sandbox, generate_calc_job, generate_inputs):
     """Test a `PpCalculation` with user-defined cmdline settings."""
     entry_point_name = 'quantumespresso.pp'

diff --git a/tests/parsers/test_pp.py b/tests/parsers/test_pp.py
@@ -125,7 +125,11 @@ def test_pp_default_1d(
     entry_point_calc_job = 'quantumespresso.pp'
     entry_point_parser = 'quantumespresso.pp'
 
-    node = generate_calc_job_node(entry_point_calc_job, fixture_localhost, 'default_1d', generate_inputs_1d)
+    attributes = {'keep_data_files': False, 'parse_data_files': True}
+
+    node = generate_calc_job_node(
+        entry_point_calc_job, fixture_localhost, 'default_1d', generate_inputs_1d, attributes=attributes
+    )
     parser = generate_parser(entry_point_parser)
     results, calcfunction = parser.parse_from_node(node, store_provenance=False)
 
@@ -157,9 +161,13 @@ def test_pp_default_1d_spherical(
     """Test a default `pp.x` calculation producing a 1D data set with spherical averaging."""
     entry_point_calc_job = 'quantumespresso.pp'
     entry_point_parser = 'quantumespresso.pp'
-
+    attributes = {'keep_data_files': False, 'parse_data_files': True}
     node = generate_calc_job_node(
-        entry_point_calc_job, fixture_localhost, 'default_1d_spherical', generate_inputs_1d_spherical
+        entry_point_calc_job,
+        fixture_localhost,
+        'default_1d_spherical',
+        generate_inputs_1d_spherical,
+        attributes=attributes
     )
     parser = generate_parser(entry_point_parser)
     results, calcfunction = parser.parse_from_node(node, store_provenance=False)
@@ -200,8 +208,11 @@ def test_pp_default_2d(
     """Test a default `pp.x` calculation producing a 2D data set."""
     entry_point_calc_job = 'quantumespresso.pp'
     entry_point_parser = 'quantumespresso.pp'
+    attributes = {'keep_data_files': False, 'parse_data_files': True}
 
-    node = generate_calc_job_node(entry_point_calc_job, fixture_localhost, 'default_2d', generate_inputs_2d)
+    node = generate_calc_job_node(
+        entry_point_calc_job, fixture_localhost, 'default_2d', generate_inputs_2d, attributes=attributes
+    )
     parser = generate_parser(entry_point_parser)
     results, calcfunction = parser.parse_from_node(node, store_provenance=False)
 
@@ -237,8 +248,11 @@ def test_pp_default_polar(
     """Test a default `pp.x` calculation producing a polar coordinates data set."""
     entry_point_calc_job = 'quantumespresso.pp'
     entry_point_parser = 'quantumespresso.pp'
+    attributes = {'keep_data_files': False, 'parse_data_files': True}
 
-    node = generate_calc_job_node(entry_point_calc_job, fixture_localhost, 'default_polar', generate_inputs_polar)
+    node = generate_calc_job_node(
+        entry_point_calc_job, fixture_localhost, 'default_polar', generate_inputs_polar, attributes=attributes
+    )
     parser = generate_parser(entry_point_parser)
     results, calcfunction = parser.parse_from_node(node, store_provenance=False)
 
@@ -267,8 +281,11 @@ def test_pp_default_3d(
     """Test a default `pp.x` calculation producing a 3D data set."""
     entry_point_calc_job = 'quantumespresso.pp'
     entry_point_parser = 'quantumespresso.pp'
+    attributes = {'keep_data_files': False, 'parse_data_files': True}
 
-    node = generate_calc_job_node(entry_point_calc_job, fixture_localhost, 'default_3d', generate_inputs_3d)
+    node = generate_calc_job_node(
+        entry_point_calc_job, fixture_localhost, 'default_3d', generate_inputs_3d, attributes=attributes
+    )
     parser = generate_parser(entry_point_parser)
     results, calcfunction = parser.parse_from_node(node, store_provenance=False)
 
@@ -297,12 +314,16 @@ def test_pp_default_3d(
     })
 
 
-def test_pp_default_3d_keep_plot_file(generate_calc_job_node, generate_parser, generate_inputs_3d, tmpdir):
-    """Test a `pp.x` calculation where `keep_plot_file=False` meaning files will be parsed from temporary directory."""
+def test_pp_default_3d_keep_data_files(generate_calc_job_node, generate_parser, generate_inputs_3d, tmpdir):
+    """Test a `pp.x` calculation where `keep_data_files=False` meaning files will be parsed from temporary directory."""
     entry_point_calc_job = 'quantumespresso.pp'
     entry_point_parser = 'quantumespresso.pp'
 
-    attributes = {'options': {'keep_plot_file': False}, 'retrieve_temporary_list': ['aiida.fileout']}
+    attributes = {
+        'keep_data_files': False,
+        'parse_data_files': True,
+        'retrieve_temporary_list': ['aiida.fileout'],
+    }
     node = generate_calc_job_node(
         entry_point_calc_job,
         test_name='default_3d',
@@ -320,12 +341,36 @@ def test_pp_default_3d_keep_plot_file(generate_calc_job_node, generate_parser, g
     assert len(results['output_data'].get_arraynames()) == 4
 
 
+def test_pp_default_3d_parse_data_files(generate_calc_job_node, generate_parser, generate_inputs_3d, tmpdir):
+    """Test a `pp.x` calculation where `parse_data_files=False`, so data files won't be parsed."""
+    entry_point_calc_job = 'quantumespresso.pp'
+    entry_point_parser = 'quantumespresso.pp'
+
+    attributes = {'keep_data_files': False, 'parse_data_files': False}
+    node = generate_calc_job_node(
+        entry_point_calc_job,
+        test_name='default_3d',
+        inputs=generate_inputs_3d,
+        attributes=attributes,
+    )
+    parser = generate_parser(entry_point_parser)
+    results, calcfunction = parser.parse_from_node(node, store_provenance=False, retrieved_temporary_folder=tmpdir)
+
+    assert calcfunction.is_finished, calcfunction.exception
+    assert calcfunction.is_finished_ok, calcfunction.exit_message
+    assert 'output_parameters' in results  # the parameters output is still produced
+    assert 'output_data' not in results  # no data output node when parsing is disabled
+
+
 def test_pp_default_3d_multiple(generate_calc_job_node, generate_parser, generate_inputs_3d):
     """Test a default `pp.x` calculation producing multiple files in 3D format."""
     entry_point_calc_job = 'quantumespresso.pp'
     entry_point_parser = 'quantumespresso.pp'
+    attributes = {'keep_data_files': False, 'parse_data_files': True}
 
-    node = generate_calc_job_node(entry_point_calc_job, test_name='default_3d_multiple', inputs=generate_inputs_3d)
+    node = generate_calc_job_node(
+        entry_point_calc_job, test_name='default_3d_multiple', inputs=generate_inputs_3d, attributes=attributes
+    )
     parser = generate_parser(entry_point_parser)
     results, calcfunction = parser.parse_from_node(node, store_provenance=False)
 
@@ -364,9 +409,14 @@ def test_pp_default_3d_failed_missing_data(
     """Test a default `pp.x` calculation where the aiida.fileout file is missing."""
     entry_point_calc_job = 'quantumespresso.pp'
     entry_point_parser = 'quantumespresso.pp'
+    attributes = {'keep_data_files': False, 'parse_data_files': True}
 
     node = generate_calc_job_node(
-        entry_point_calc_job, fixture_localhost, 'default_3d_failed_missing_data', generate_inputs_3d
+        entry_point_calc_job,
+        fixture_localhost,
+        'default_3d_failed_missing_data',
+        generate_inputs_3d,
+        attributes=attributes
     )
     parser = generate_parser(entry_point_parser)
     _, calcfunction = parser.parse_from_node(node, store_provenance=False)
@@ -398,9 +448,10 @@ def test_pp_default_3d_failed_format(fixture_localhost, generate_calc_job_node,
     """Test a default `pp.x` calculation where an unsupported output file format is used."""
     entry_point_calc_job = 'quantumespresso.pp'
     entry_point_parser = 'quantumespresso.pp'
+    attributes = {'keep_data_files': False, 'parse_data_files': True}
 
     node = generate_calc_job_node(
-        entry_point_calc_job, fixture_localhost, 'default_3d_failed_format', generate_inputs_3d
+        entry_point_calc_job, fixture_localhost, 'default_3d_failed_format', generate_inputs_3d, attributes=attributes
     )
     parser = generate_parser(entry_point_parser)
     _, calcfunction = parser.parse_from_node(node, store_provenance=False)