parsing orca files - major revamp (#5)

* mod parse_orca.py to skip first output_block if len(output_blocks) > 1 * mod orca_file.py to read charge and multiplicity from SCF settings for compatibility with compound jobs. also annotate parse_orca.py to describe conditional for returning output blocks * mod orca_file.py to recognize run time and successful terminations in compound jobs * tidy orca_file.py by removing lines that were commented out * add TightOpt and OptTs to job types, comment out if len(enthalpies) > 0 to prevent errors on parsing jobs that don't converge * update orca_file.py and parse_orca.py to look for multiple frequency calcs and return the last one * update docs and untit tests to indicate incompatibility with MiniPrint * start new orca recipe * clean up OrcaJobTypes with value and expected properties accessed via tuple. If ScanTs is in header, add OPT to job_types. * update description of properties_list function in ensemble.py * update tutorial 1 to prevent error message about accessing molecules via file.molecule * update hello_world.py to prevent error * add ORCA recipe * parsing charges * add orca to tutorial_01 * format tutorial_01 * format tutorial_01 * format tutorial_01 * format tutorial_01 * update recipe 9 * update recipe 9 * update recipe 9 * add orca tests for frequencies and compound jobs * format recipe09 --------- Co-authored-by: Joe Gair <[email protected]>
ekwan · Apr 11, 2023 · d971a8f · d971a8f
1 parent e00e298
commit d971a8f
Show file tree

Hide file tree

Showing 28 changed files with 222,732 additions and 132 deletions.
diff --git a/cctk/ensemble.py b/cctk/ensemble.py
@@ -113,7 +113,7 @@ def molecule_list(self):
 
     def properties_list(self):
         """
-        Returns a list of the constituent molecules.
+        Returns a list of dictionaries. One dictionary per geometry. Each dictionary contains the property names and property values for each geometry in the ensemble. 
         """
         return list(self.values())
 

diff --git a/cctk/orca_file.py b/cctk/orca_file.py
@@ -17,35 +17,29 @@ class OrcaJobType(Enum):
     All jobs have type ``SP`` by default.
     """
 
-    SP = "sp"
+    SP = ("sp", ["energy", "scf_iterations"])
     """
     Single point energy calculation.
     """
 
-    OPT = "opt"
+    OPT = ("opt", ["rms_gradient", "rms_step", "max_gradient", "max_step"])
     """
     Geometry optimization.
     """
 
-    FREQ = "freq"
+    FREQ = ("freq", ["gibbs_free_energy", "enthalpy", "frequencies", "temperature"])
     """
     Hessian calculation.
     """
 
-    NMR = "nmr"
+    NMR = ("nmr", ["isotropic_shielding"])
     """
     NMR shielding prediction.
     """
 
-#### This static variable tells what properties are expected from each JobType.
-EXPECTED_PROPERTIES = {
-    "sp": ["energy", "scf_iterations",],
-#    "opt": ["rms_gradient", "rms_step", "max_gradient", "max_step"],
-    "opt": [],
-    "freq": ["gibbs_free_energy", "enthalpy", "frequencies", "temperature"],
-    "nmr": ["isotropic_shielding",],
-}
-
+    def __init__(self, value, expected_properties):
+        self._value_ = value
+        self.expected_properties = expected_properties
 
 class OrcaFile(File):
     """
@@ -107,19 +101,40 @@ def read_file(cls, filename):
             job_types = cls._assign_job_types(header)
             variables, blocks = parse.read_blocks_and_variables(input_lines)
 
-            success = 0
+            successful_scf_convergence = 0
+            successful_opt = 0
+            successful_freq = 0
+            successful_NMR_EPR = 0
+            is_scan_job = False
+            # add identifiers for successful termination of other job types
+
             elapsed_time = 0
-            for line in lines:
-                if line.strip().startswith("****ORCA TERMINATED NORMALLY****"):
-                    success += 1
-                elif line.startswith("TOTAL RUN TIME"):
+            for line in lines:        
+                if line.startswith("FINAL SINGLE POINT ENERGY"):                                        #### SCF converged at least once
+                    successful_scf_convergence += 1
+                elif line.strip().startswith("***        THE OPTIMIZATION HAS CONVERGED     ***"):      #### geometry converged
+                    successful_opt += 1
+                elif line.startswith("VIBRATIONAL FREQUENCIES"):                                        #### a frequency job was completed
+                    successful_freq += 1                                                                
+                elif line.startswith("Maximum memory used throughout the entire EPRNMR-calculation:"):  #### an EPR NMR job was completed
+                    successful_NMR_EPR += 1
+                elif line.strip().startswith("*    Relaxed Surface Scan    *"):                         #### this is a scan job
+                    is_scan_job = True
+                elif line.startswith("Sum of individual times         ..."):                            #### the job was completed
                     fields = line.split()
-                    assert len(fields) == 13, f"unexpected number of fields on elapsed time line:\n{line}"
-                    days = float(fields[3])
-                    hours = float(fields[5])
-                    minutes = float(fields[7])
-                    seconds = float(fields[9])
-                    elapsed_time = days * 86400 + hours * 3600 + minutes * 60 + seconds
+                    assert len(fields) == 10, f"unexpected number of fields on elapsed time line:\n{line}"
+                    elapsed_time = float(fields[5])
+
+            # different than G16 "successful termination"
+            success = 0   
+            if successful_opt > 0:
+                success += 1
+            if successful_freq > 0:
+                success += 1
+            if successful_NMR_EPR > 0:
+                success += 1
+            if successful_scf_convergence > 0:
+                success += 1
 
             energies, iters = parse.read_energies(lines)
             if len(energies) == 0:
@@ -128,8 +143,9 @@ def read_file(cls, filename):
             atomic_numbers, geometries = parse.read_geometries(lines, num_to_find=len(energies))
             assert len(geometries) >= len(energies), "can't have an energy without a geometry (cf. pigeonhole principle)"
 
-            charge = lines.find_parameter("xyz", 6, 4)[0]
-            multip = lines.find_parameter("xyz", 6, 5)[0]
+            # this approach does not work with the option miniprint
+            charge = lines.find_parameter("Total Charge           Charge          ....", 5, 4)[0]
+            multip = lines.find_parameter("Multiplicity           Mult            ....", 4, 3)[0]
 
             #### TODO
             # detect Mayer bond orders
@@ -169,35 +185,44 @@ def read_file(cls, filename):
                         properties[idx]["max_step"] = max_step[idx]
 
             if OrcaJobType.FREQ in job_types:
-                properties[-1]["frequencies"] = sorted(parse.read_freqs(lines))
+                properties[-1]["frequencies"] = sorted(parse.read_freqs(lines, successful_freq))
 
                 enthalpies = lines.find_parameter("Total Enthalpy", expected_length=5, which_field=3)
-                properties[-1]["enthalpy"] = enthalpies[-1]
+                try:
+                    properties[-1]["enthalpy"] = enthalpies[-1]
+                except Exception as e:
+                    pass
 
                 gibbs = lines.find_parameter("Final Gibbs free", expected_length=7, which_field=5)
-                properties[-1]["gibbs_free_energy"] = gibbs[-1]
-
-                temperature = lines.find_parameter("Temperature", expected_length=4, which_field=2)
-                if len(temperature) > 0 and len(gibbs) > 0:
-                    properties[-1]["temperature"] = temperature[-1]
-                    corrected_free_energy = get_corrected_free_energy(gibbs[-1], properties[-1]["frequencies"],
+                try:
+                    properties[-1]["gibbs_free_energy"] = gibbs[-1]
+                except Exception as e:
+                    pass
+
+                try:
+                    temperature = lines.find_parameter("Temperature", expected_length=4, which_field=2)
+                    if len(temperature) > 0 and len(gibbs) > 0:
+                        properties[-1]["temperature"] = temperature[-1]
+                        corrected_free_energy = get_corrected_free_energy(gibbs[-1], properties[-1]["frequencies"],
                                                                       frequency_cutoff=100.0, temperature=temperature[-1])
-                    properties[-1]["quasiharmonic_gibbs_free_energy"] = float(corrected_free_energy)
+                        properties[-1]["quasiharmonic_gibbs_free_energy"] = float(corrected_free_energy)
+                except Exception as e:
+                    pass
 
             if OrcaJobType.NMR in job_types:
                 nmr_shifts = parse.read_nmr_shifts(lines, molecules[0].num_atoms())
                 if nmr_shifts is not None:
                     properties[-1]["isotropic_shielding"] = nmr_shifts
 
             try:
-                charges = parse.read_mulliken_charges(lines)
+                charges = parse.read_mulliken_charges(lines, successful_opt, is_scan_job)
                 assert len(charges) == len(atomic_numbers)
                 properties[-1]["mulliken_charges"] = charges
             except Exception as e:
                 pass
 
             try:
-                charges = parse.read_loewdin_charges(lines)
+                charges = parse.read_loewdin_charges(lines, successful_opt, is_scan_job)
                 assert len(charges) == len(atomic_numbers)
                 properties[-1]["lowdin_charges"] = charges
             except Exception as e:
@@ -253,7 +278,7 @@ def write_file(self, filename, molecule=None, header=None, variables=None, block
         self.write_molecule_to_file(filename, molecule, header, variables, blocks)
 
     @classmethod
-    def write_molecule_to_file(cls, filename, molecule, header, variables=None, blocks=None, print_symbol=False):
+    def write_molecule_to_file(cls, filename, molecule, header, variables={"maxcore": 2000}, blocks={"pal": ["nproc 16"]}, print_symbol=False):
         """
         Write an ``.inp`` file using the given molecule.
 
@@ -336,7 +361,7 @@ def imaginaries(self):
     @classmethod
     def _assign_job_types(cls, header):
         """
-        Assigns ``OrcaJobType`` objects from route card. ``OrcaJobType.SP`` is assigned by default.
+        Assigns ``OrcaJobType`` objects from header. ``OrcaJobType.SP`` is assigned by default.
 
         Args:
             header (str): Orca header
@@ -345,11 +370,13 @@ def _assign_job_types(cls, header):
             list of ``OrcaJobType`` objects
         """
         job_types = []
-        for name, member in OrcaJobType.__members__.items():
-            if re.search(f" {member.value}", str(header), re.IGNORECASE):
-                job_types.append(member)
+        for job_type in OrcaJobType:
+            if re.search(f"{job_type.value}", str(header), re.IGNORECASE):
+                job_types.append(job_type)
         if OrcaJobType.SP not in job_types:
-            job_types.append(OrcaJobType.SP)
+            job_types.append(OrcaJobType.SP) # include SP in all jobs, whether specified in header or not
+        if re.search("ScanTs", str(header), re.IGNORECASE):
+            job_types.append(OrcaJobType.OPT) #ScanTs is an optimization job that does not contain the string "opt" in it's keyword
         return job_types
 
     def check_has_properties(self):
@@ -359,8 +386,10 @@ def check_has_properties(self):
         This only checks the last molecule in ``self.ensemble``, for now.
         """
         if self.successful_terminations > 0:
+            if self.successful_terminations == 2 and ((OrcaJobType.OPT in self.job_types) and (OrcaJobType.FREQ in self.job_types)):
+                return # opt and freq jobs should have three terminations
             for job_type in self.job_types:
-                for prop in EXPECTED_PROPERTIES[job_type.value]:
+                for prop in job_type.expected_properties:
                     if not self.ensemble.has_property(-1, prop):
                         raise ValueError(f"expected property {prop} for job type {job_type}, but it's not there!")
         else:

diff --git a/cctk/parse_orca.py b/cctk/parse_orca.py
@@ -1,5 +1,5 @@
 import numpy as np
-import re
+import re, warnings
 
 from cctk.helper_functions import get_number
 from cctk import OneIndexedArray, LazyLineObject
@@ -61,59 +61,87 @@ def split_multiple_inputs(filename):
     start_block = 0
     with open(filename, "r") as lines:
         for idx, line in enumerate(lines):
-            if re.search("COMPOUND  JOB \d{1,}", line):
+            if re.search("COMPOUND JOB  \d{1,}", line):
                 output_blocks.append(LazyLineObject(file=filename, start=start_block, end=idx))
                 start_block = idx
     output_blocks.append(LazyLineObject(file=filename, start=start_block, end=idx))
 
-    return output_blocks[:]
+    if len(output_blocks) <= 1:
+        return output_blocks[:]
 
-def read_mulliken_charges(lines):
+    # this conditional skips the first block of %compound jobs which only describes the input and doesn't have an associated energy
+    elif len(output_blocks) > 1:
+        return output_blocks[1:]
+
+def read_mulliken_charges(lines, successful_opt, is_scan_job):
     """
-    Reads charges.
+    Reads charges. Returns charges on penultimate geometry for some scan jobs. 
 
     Args:
         lines (list): list of lines in file
 
     Returns:
         ``cctk.OneIndexedArray`` of charges
     """
-    charges = []
-    charge_block = lines.search_for_block("MULLIKEN ATOMIC CHARGES", "Sum of atomic charges", join="\n")
-    for line in charge_block.split("\n")[2:]:
-        fields = re.split(" +", line)
-        fields = list(filter(None, fields))
 
-        if len(fields) == 4:
-            charges.append(float(fields[3]))
+    count = successful_opt + 1
+    if is_scan_job:
+        count = 2*successful_opt
+    else: 
+        count = successful_opt + 1
+
+    charge_block = lines.search_for_block("MULLIKEN ATOMIC CHARGES", "Sum of atomic charges", count=count, join="\n")
+    if not isinstance(charge_block, list):
+        charge_block = [charge_block]
+
+    for block in charge_block:
+        charges = []
+        for line in block.split("\n")[2:]:
+            fields = re.split(" +", line)
+            fields = list(filter(None, fields))
+            if len(fields) == 4:
+                charges.append(float(fields[3]))
 
     return OneIndexedArray(charges)
 
 
-def read_loewdin_charges(lines):
+
+def read_loewdin_charges(lines, successful_opt, is_scan_job):
     """
-    Reads charges.
+    Reads charges. Returns charges on penultimate geometry for some scan jobs. 
 
     Args:
         lines (list): list of lines in file
 
     Returns:
         ``cctk.OneIndexedArray`` of charges
     """
-    charges = []
-    charge_block = lines.search_for_block("LOEWDIN ATOMIC CHARGES", "^$", join="\n")
-    for line in charge_block.split("\n")[2:]:
-        fields = re.split(" +", line)
-        fields = list(filter(None, fields))
-
-        if len(fields) == 4:
-            charges.append(float(fields[3]))
+    count = successful_opt + 1
+    if is_scan_job:
+        count = 2*successful_opt
+    else: 
+        count = successful_opt + 1
+
+    charge_block = lines.search_for_block("LOEWDIN ATOMIC CHARGES", "^$", count=count, join="\n")
+    if not isinstance(charge_block, list):
+        charge_block = [charge_block]
+
+    for block in charge_block:
+        charges = []
+        for line in block.split("\n")[2:]:
+            fields = re.split(" +", line)
+            fields = list(filter(None, fields))
+            if len(fields) == 4:
+                charges.append(float(fields[3]))
 
     return OneIndexedArray(charges)
 
+
 def read_header(lines):
     for line in lines:
         if re.match("!", line):
+            if 'miniprint' in line.lower():
+                warnings.warn('The miniprint option may lead to parsing errors in cctk')
             return line
 
 def read_blocks_and_variables(lines):
@@ -149,17 +177,33 @@ def extract_input_file(lines):
         input_lines.append(line)
     return input_lines
 
-def read_freqs(lines):
-    freq_block = lines.search_for_block("VIBRATIONAL FREQUENCIES", "NORMAL MODES", join="\n", max_len=1000)
-    if freq_block is None:
+
+
+def read_freqs(lines, successful_freq):
+    freq_blocks = lines.search_for_block("VIBRATIONAL FREQUENCIES", "NORMAL MODES", count=successful_freq, join="\n", max_len=1000)
+    if freq_blocks is None:
         return []
-    freqs = []
-    for line in freq_block.split("\n"):
-        fields = re.split(" +", line.strip())
-        if len(fields) == 3:
-            if fields[2] == "cm**-1" and float(fields[1]) > 0:
-                freqs.append(float(fields[1]))
-    return freqs
+
+    elif successful_freq == 1:
+        freqs = []
+        for line in freq_blocks.split("\n"):
+            fields = re.split(" +", line.strip())
+            if len(fields) == 3 or len(fields) == 5:
+                if fields[2] == "cm**-1" and float(fields[1]) != 0:
+                    freqs.append(float(fields[1]))
+        return freqs
+
+    elif successful_freq > 1 :
+        freqs_lists = []
+        for freq_block in freq_blocks:
+            freqs = []
+            for line in freq_block.split("\n"):
+                fields = re.split(" +", line.strip())
+                if len(fields) == 3 or len(fields) == 5:
+                    if fields[2] == "cm**-1" and float(fields[1]) != 0:
+                        freqs.append(float(fields[1]))
+            freqs_lists.append(freqs)
+        return freqs_lists[-1]
 
 def read_gradients(lines, num_to_find):
     grad_blocks = lines.search_for_block("Geometry convergence", "Max\(Bonds", join="\n", count=num_to_find)

diff --git a/docs/img/r09_step_vs_energy.png b/docs/img/r09_step_vs_energy.png
diff --git a/docs/img/r09_step_vs_rms_grad.png b/docs/img/r09_step_vs_rms_grad.png
diff --git a/docs/installation.rst b/docs/installation.rst
@@ -74,7 +74,7 @@ To run a single unit test::
 
 To run all the tests::
 
-    pythom -m unittest discover
+    python -m unittest discover
Original file line number	Diff line number	Diff line change
Expand Up		@@ -74,7 +74,7 @@ To run a single unit test::

		To run all the tests::

		pythom -m unittest discover
		python -m unittest discover



Expand Down