diff --git a/.gitignore b/.gitignore index 90350c8d..8d08f814 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,7 @@ slurm-*.out *~ *.log .env +# HON only +config.yml +config.aws.yml +src/utilities/download_files.py \ No newline at end of file diff --git a/config.yml b/config.yml index 6f359960..eb0d9a02 100644 --- a/config.yml +++ b/config.yml @@ -3,8 +3,9 @@ ## General RunName: "Test_Permian_1week" +Species: "CH4" isAWS: true -UseSlurm: true +SchedulerType: "slurm" SafeMode: true S3Upload: false @@ -13,8 +14,16 @@ StartDate: 20180501 EndDate: 20180508 SpinupMonths: 1 -## Use blended TROPOMI+GOSAT data (true)? Or use operational TROPOMI data (false)? -BlendedTROPOMI: false +## What satellite data product should be used? Current options are: +## "BlendedTROPOMI" : The dataset generated by Balasus et al. in which +## the TROPOMI data are fit to GOSAT data using ML +## "TROPOMI" : The operational TROPOMI data +## "Other" : Any other dataset +## Currently, only "BlendedTROPOMI" and "TROPOMI" are supported on AWS. If +## "Other" is selected, the user must specify the path where observations are +## located under "Advanced settings" in this file. +SatelliteProduct: "Other" +# BlendedTROPOMI: false ## Use observations over water? Set to false to filter out water observations UseWaterObs: false @@ -128,10 +137,10 @@ DOFSThreshold: 0 ## Resource allocation settings for slurm jobs RequestedCPUs: 8 -RequestedMemory: 10000 +RequestedMemory: "10gb" InversionCPUs: 32 -InversionMemory: 32000 -RequestedTime: "0-24:00" +InversionMemory: "32gb" +RequestedTime: "24:00:00" SchedulerPartition: "debug" ## Max number of simultaneous Jacobian runs from the job array (-1: no limit) @@ -169,7 +178,7 @@ PerturbValueBCs: 10.0 ## Save out hourly diagnostics from GEOS-Chem? ## For use in satellite operators via post-processing -- required for TROPOMI ## inversions -HourlyCH4: true +HourlySpecies: true ## Turn on planeflight diagnostic in GEOS-Chem? ## For use in comparing GEOS-Chem against planeflight data. The path @@ -183,6 +192,9 @@ GOSAT: false TCCON: false AIRS: false +## Use global boundary condition files for initial conditions +UseBCsForRestart: False + ##------------------------------------------------------------------ ## Settings for running on local cluster ##------------------------------------------------------------------ @@ -194,8 +206,7 @@ OutputPath: "/home/ubuntu/imi_output_dir" DataPath: "/home/ubuntu/ExtData" ## Conda environment file -CondaFile: "/home/ubuntu/.bashrc" -CondaEnv: "imi_env" +PythonEnv: "/home/ubuntu/integrated_methane_inversion/envs/aws/python.env" ## Download initial restart file from AWS S3? ## NOTE: Must have AWS CLI enabled diff --git a/docs/source/advanced/local-cluster.rst b/docs/source/advanced/local-cluster.rst index b97a4b47..86420873 100644 --- a/docs/source/advanced/local-cluster.rst +++ b/docs/source/advanced/local-cluster.rst @@ -51,18 +51,19 @@ for AWS and Harvard's Cannon cluster. $ ls envs/* envs/aws: - conda_env.yml slurm/ spack_env.env + conda_env.yml python.env slurm/ spack_env.env envs/Harvard-Cannon: - ch4_inv.yml gcclassic.rocky+gnu10.minimal.env* gcclassic.rocky+gnu10.env* - config.harvard-cannon.yml gcclassic.rocky+gnu12.minimal.env* README + ch4_inv.yml gcclassic.rocky+gnu10.minimal.env* gcclassic.rocky+gnu10.env* python.env + config.harvard-cannon.yml gcclassic.rocky+gnu12.minimal.env* imi_env.yml README We recommend you add a subfolder within ``envs`` for your own system to easily access your customized files needed for the IMI. 
In this directory, we recommend storing any environment files needed to load the libraries for GEOS-Chem (e.g. fortran compiler, netcdf, openmpi, -cmake), a conda environment file, and a copy of the IMI configuration file -modified for your system. See the files in ``envs/Harvard-Cannon`` for examples. +cmake), a Python environment file, a file that activates your Python +environment, and a copy of the IMI configuration file modified for +your system. See the files in ``envs/Harvard-Cannon`` for examples. We recommend basing your config file off of ``config.harvard-cannon.yml``. Within the copied IMI configuration file, you will need to modify the @@ -77,7 +78,7 @@ modules" and "Run modules" and turning them on one or a few at a time. You may find that you need to manually edit some files. For example, after creating the template run directory, but before creating your spinup, Jacobian, and posterior run directories, you should open -``ch4_run.template`` in a text editor and modify as needed for your +``run.template`` in a text editor and modify as needed for your system (by default this script is set up to submit to a SLURM scheduler). diff --git a/docs/source/advanced/running-with-tmux.rst b/docs/source/advanced/running-with-tmux.rst index 75fc6f4b..5985c540 100644 --- a/docs/source/advanced/running-with-tmux.rst +++ b/docs/source/advanced/running-with-tmux.rst @@ -7,7 +7,7 @@ allows you to run a program on your EC2 instance, disconnect, and then reconnect Because of the way the IMI is parallelized, using tmux can grant a small to moderate speed-up. .. note:: - Before running the IMI with tmux, make sure the ``UseSlurm`` option in the :doc:`configuration file <../getting-started/imi-config-file>` + Before running the IMI with tmux, make sure the ``SchedulerType`` option in the :doc:`configuration file <../getting-started/imi-config-file>` is not set to a batch scheduler (``slurm`` or ``PBS``). Using tmux diff --git a/docs/source/getting-started/imi-config-file.rst b/docs/source/getting-started/imi-config-file.rst index 233d596a..d5d95be0 100644 --- a/docs/source/getting-started/imi-config-file.rst +++ b/docs/source/getting-started/imi-config-file.rst @@ -12,10 +12,9 @@ General - Name for this inversion; will be used for directory names and prefixes. * - ``isAWS`` - Boolean for running the IMI on AWS (``true``) or a local cluster (``false``). - * - ``UseSlurm`` - - Boolean for running the IMI as a batch job with ``sbatch`` instead of interactively. - Select ``true`` to run the IMI with ``sbatch run_imi.sh``. - Select ``false`` to run the IMI with ``./run_imi.sh`` (:doc:`via tmux <../advanced/running-with-tmux>`). + * - ``SchedulerType`` + - String defining the type of scheduler used to run the IMI. + Currently supported options are "slurm" or "PBS". * - ``SafeMode`` - Boolean for running in safe mode to prevent overwriting existing files. * - ``S3Upload`` @@ -294,7 +293,7 @@ These settings are intended for advanced users who wish to modify additional GEO - Value to perturb OH by if using ``OptimizeOH``. Default value is ``1.5``. * - ``PerturbValueBCs`` - Number of ppb to perturb emissions by for domain edges (North, South, East, West) if using ``OptimizeBCs``. Default value is ``10.0`` ppb. - * - ``HourlyCH4`` + * - ``HourlySpecies`` - Boolean to save out hourly diagnostics from GEOS-Chem. This output is used in satellite operators via post-processing. Default value is ``true``. * - ``PLANEFLIGHT`` - Boolean to save out the planeflight diagnostic in GEOS-Chem.
This output may be used to compare GEOS-Chem against planeflight data. The path to those data must be specified in input.geos. See the `planeflight diagnostic `_ documentation for details. Default value is ``false``. @@ -318,12 +317,10 @@ the IMI on a local cluster<../advanced/local-cluster>`). - Path for IMI runs and output. * - ``DataPath`` - Path to GEOS-Chem input data. - * - ``DataPathTROPOMI`` - - Path to TROPOMI input data. - * - ``CondaFile`` - - Path to file containing Conda environment settings. - * - ``CondaEnv`` - - Name of conda environment. + * - ``DataPathObs`` + - Path to satellite input data. + * - ``PythonEnv`` + - Path to file that activates the Python environment. * - ``RestartDownload`` - Boolean for downloading an initial restart file from AWS S3. Default value is ``true``. * - ``RestartFilePrefix`` diff --git a/envs/Harvard-Cannon/config.harvard-cannon.global_inv.yml b/envs/Harvard-Cannon/config.harvard-cannon.global_inv.yml index 99dc2486..c1711b3f 100644 --- a/envs/Harvard-Cannon/config.harvard-cannon.global_inv.yml +++ b/envs/Harvard-Cannon/config.harvard-cannon.global_inv.yml @@ -3,8 +3,9 @@ ## General RunName: "Test_IMI_Global" +Species: "CH4" isAWS: false -UseSlurm: true +SchedulerType: "slurm" SafeMode: true S3Upload: false @@ -13,8 +14,16 @@ StartDate: 20180501 EndDate: 20180502 SpinupMonths: 1 -## Use blended TROPOMI+GOSAT data (true)? Or use operational TROPOMI data (false)? -BlendedTROPOMI: false +## What satellite data product should be used? Current options are: +## "BlendedTROPOMI" : The dataset generated by Balasus et al. in which +## the TROPOMI data are fit to GOSAT data using ML +## "TROPOMI" : The operational TROPOMI data +## "Other" : Any other dataset +## Currently, only "BlendedTROPOMI" and "TROPOMI" are supported on AWS. If +## "Other" is selected, the user must specify the path where observations are +## located under "Advanced settings" in this file. +SatelliteProduct: "Other" +# BlendedTROPOMI: false ## Use observations over water? Set to false to filter out water observations UseWaterObs: false @@ -128,10 +137,10 @@ DOFSThreshold: 0 ## Resource allocation settings for slurm jobs RequestedCPUs: 32 -RequestedMemory: 32000 +RequestedMemory: "32gb" InversionCPUs: 32 -InversionMemory: 64000 -RequestedTime: "0-24:00" +InversionMemory: "64gb" +RequestedTime: "24:00:00" SchedulerPartition: "sapphire,huce_cascade,seas_compute,shared" ## Max number of simultaneous Jacobian runs from the job array (-1: no limit) @@ -169,7 +178,7 @@ PerturbValueBCs: 10.0 ## Save out hourly diagnostics from GEOS-Chem? ## For use in satellite operators via post-processing -- required for TROPOMI ## inversions -HourlyCH4: true +HourlySpecies: true ## Turn on planeflight diagnostic in GEOS-Chem? ## For use in comparing GEOS-Chem against planeflight data. 
The path @@ -183,6 +192,9 @@ GOSAT: false TCCON: false AIRS: false +## Use global boundary condition files for initial conditions +UseBCsForRestart: False + ##------------------------------------------------------------------ ## Settings for running on local cluster ##------------------------------------------------------------------ @@ -194,17 +206,15 @@ OutputPath: "/n/holyscratch01/jacob_lab/$USER" DataPath: "/n/holyscratch01/external_repos/GEOS-CHEM/gcgrid/gcdata/ExtData" ## Path to TROPOMI Data -DataPathTROPOMI: "/n/holylfs05/LABS/jacob_lab/imi/ch4/tropomi" - -## Conda environment file -## See envs/README to create the Conda environment specified below -CondaEnv: "imi_env" -CondaFile: "${HOME}/.bashrc" +DataPathObs: "/n/holylfs05/LABS/jacob_lab/imi/ch4/tropomi" ## GEOS-Chem environment file (with fortran compiler, netcdf libraries, etc.) ## NOTE: Copy your own file in the envs/ directory within the IMI GEOSChemEnv: "envs/Harvard-Cannon/gcclassic.rocky+gnu12.minimal.env" +## Python environment file (this is normally one or two lines) +PythonEnv: "envs/Harvard-Cannon/python.env" + ## Download initial restart file from AWS S3? ## NOTE: Must have AWS CLI enabled RestartDownload: false diff --git a/envs/Harvard-Cannon/config.harvard-cannon.yml b/envs/Harvard-Cannon/config.harvard-cannon.yml index a56f8e2b..8419089f 100644 --- a/envs/Harvard-Cannon/config.harvard-cannon.yml +++ b/envs/Harvard-Cannon/config.harvard-cannon.yml @@ -3,8 +3,9 @@ ## General RunName: "Test_IMI_Permian" +Species: "CH4" isAWS: false -UseSlurm: true +SchedulerType: "slurm" SafeMode: true S3Upload: false @@ -13,8 +14,15 @@ StartDate: 20180501 EndDate: 20180508 SpinupMonths: 1 -## Use blended TROPOMI+GOSAT data (true)? Or use operational TROPOMI data (false)? -BlendedTROPOMI: false +## What satellite data product should be used? Current options are: +## "BlendedTROPOMI" : The dataset generated by Balasus et al. in which +## the TROPOMI data are fit to GOSAT data using ML +## "TROPOMI" : The operational TROPOMI data +## "Other" : Any other dataset +## Currently, only "BlendedTROPOMI" and "TROPOMI" are supported on AWS. If +## "Other" is selected, the user must specify the path where observations are +## located under "Advanced settings" in this file. +SatelliteProduct: "Other" ## Use observations over water? Set to false to filter out water observations UseWaterObs: false @@ -128,10 +136,10 @@ DOFSThreshold: 0 ## Resource allocation settings for slurm jobs RequestedCPUs: 32 -RequestedMemory: 32000 +RequestedMemory: "32gb" InversionCPUs: 32 -InversionMemory: 64000 -RequestedTime: "0-24:00" +InversionMemory: "64gb" +RequestedTime: "24:00:00" SchedulerPartition: "sapphire,huce_cascade,seas_compute,shared" ## Max number of simultaneous Jacobian runs from the job array (-1: no limit) @@ -169,7 +177,7 @@ PerturbValueBCs: 10.0 ## Save out hourly diagnostics from GEOS-Chem? ## For use in satellite operators via post-processing -- required for TROPOMI ## inversions -HourlyCH4: true +HourlySpecies: true ## Turn on planeflight diagnostic in GEOS-Chem? ## For use in comparing GEOS-Chem against planeflight data. 
The path @@ -183,6 +192,9 @@ GOSAT: false TCCON: false AIRS: false +## Use global boundary condition files for initial conditions +UseBCsForRestart: False + ##------------------------------------------------------------------ ## Settings for running on local cluster ##------------------------------------------------------------------ @@ -194,17 +205,15 @@ OutputPath: "/n/holyscratch01/jacob_lab/$USER" DataPath: "/n/holyscratch01/external_repos/GEOS-CHEM/gcgrid/gcdata/ExtData" ## Path to TROPOMI Data -DataPathTROPOMI: "/n/holylfs05/LABS/jacob_lab/imi/ch4/tropomi" - -## Conda environment file -## See envs/README to create the Conda environment specified below -CondaEnv: "imi_env" -CondaFile: "${HOME}/.bashrc" +DataPathObs: "/n/holylfs05/LABS/jacob_lab/imi/ch4/tropomi" ## GEOS-Chem environment file (with fortran compiler, netcdf libraries, etc.) ## NOTE: Copy your own file in the envs/ directory within the IMI GEOSChemEnv: "envs/Harvard-Cannon/gcclassic.rocky+gnu12.minimal.env" +## Python environment file (this is normally one or two lines) +PythonEnv: "envs/Harvard-Cannon/python.env" + ## Download initial restart file from AWS S3? ## NOTE: Must have AWS CLI enabled RestartDownload: false diff --git a/envs/Harvard-Cannon/python.env b/envs/Harvard-Cannon/python.env new file mode 100644 index 00000000..bacb71ea --- /dev/null +++ b/envs/Harvard-Cannon/python.env @@ -0,0 +1,6 @@ +#============================================================================== +# Load Python environment +#============================================================================== +printf "\nActivating conda environment: imi_env\n" +source ~/.bashrc +conda activate imi_env diff --git a/envs/NASA-Pleiades/config.nasa-pleiades.global_inv.yml b/envs/NASA-Pleiades/config.nasa-pleiades.global_inv.yml new file mode 100644 index 00000000..d5a92509 --- /dev/null +++ b/envs/NASA-Pleiades/config.nasa-pleiades.global_inv.yml @@ -0,0 +1,230 @@ +## IMI configuration file +## Documentation @ https://imi.readthedocs.io/en/latest/getting-started/imi-config-file.html + +## General +RunName: "Test_ICI_Global" +Species: "CO2" +isAWS: false +SchedulerType: "PBS" +SafeMode: false +S3Upload: false + +## Period of interest +StartDate: 20141001 +EndDate: 20160401 +SpinupMonths: 1 + +## What satellite data product should be used? Current options are: +## "BlendedTROPOMI" : The dataset generated by Balasus et al. in which +## the TROPOMI data are fit to GOSAT data using ML +## "TROPOMI" : The operational TROPOMI data +## "Other" : Any other dataset +## Currently, only "BlendedTROPOMI" and "TROPOMI" are supported on AWS. If +## "Other" is selected, the user must specify the path where observations are +## located under "Advanced settings" in this file. +SatelliteProduct: "Other" + +## Is this a regional inversion? Set to false for global inversion +isRegional: false + +## Select two character region ID (for using pre-cropped meteorological fields) +## Current options are listed below with ([lat],[lon]) bounds: +## "AF" : Africa ([-37,40], [-20,53]) +## "AS" : Asia ([-11,55],[60,150]) +## "EU" : Europe ([33,61],[-30,70]) +## "ME" : Middle East ([12,50], [-20,70]) +## "NA" : North America ([10,70],[-140,-40]) +## "OC" : Oceania ([-50,5], [110,180]) +## "RU" : Russia ([41,83], [19,180]) +## "SA" : South America ([-59,16], [-88,-31]) +## "" : Use for a global simulation or custom regions +## For example, if the region of interest is in Europe ([33,61],[-30,70]), select "EU".
+RegionID: "" + +## Region of interest +## These lat/lon bounds are only used if CreateAutomaticRectilinearStateVectorFile: true +## Otherwise lat/lon bounds are determined from StateVectorFile +LonMin: -102.5 +LonMax: -87.5 +LatMin: 16 +LatMax: 24 + +## Kalman filter options +KalmanMode: false +UpdateFreqDays: 7 +NudgeFactor: 0.1 + +## State vector +CreateAutomaticRectilinearStateVectorFile: true +nBufferClusters: 0 +BufferDeg: 0 +LandThreshold: 0.25 +OffshoreEmisThreshold: 0 +OptimizeBCs: false +OptimizeOH: false + +## Point source datasets +## Used for visualizations and state vector clustering +PointSourceDatasets: ["SRON"] + +## Clustering Options +ReducedDimensionStateVector: false +DynamicKFClustering: false +ClusteringMethod: "kmeans" +NumberOfElements: 45 +ForcedNativeResolutionElements: + - [31.5, -104] +EmissionRateFilter: 2500 +PlumeCountFilter: 50 +GroupByCountry: false + +## Custom state vector +StateVectorFile: "/nobackupp27/hnesser/CO2_inversion/state_vector/clusters_annual.nc" +ShapeFile: "None" + +## Inversion +## Note PriorError and PriorErrorOH are relative fractions (e.g. 0.5 = 50%) +## and PriorErrorBCs is in ppb +LognormalErrors: false +PriorError: 0.5 +PriorErrorBCs: 10.0 +PriorErrorOH: 0.5 +ObsError: 15 +Gamma: 1.0 +PrecomputedJacobian: false + +## Grid +## Options are 0.25x0.3125 (GEOSFP only), 0.5x0.625, 2.0x2.5, or 4.0x5.0 +Res: "2.0x2.5" + +## Meteorology +## Options are GEOSFP or MERRA2 +Met: "MERRA2" + +## Setup modules +## Turn on/off different steps in setting up the inversion +RunSetup: true +SetupTemplateRundir: true +SetupSpinupRun: false +SetupJacobianRuns: true +SetupInversion: false +SetupPosteriorRun: false + +## Run modules +## Turn on/off different steps in performing the inversion +DoPriorEmis: false +DoSpinup: false +DoJacobian: false +ReDoJacobian: false +DoInversion: false +DoPosterior: false + +## IMI preview +## NOTE: RunSetup must be true to run preview +DoPreview: false +DOFSThreshold: 0 + +## Resource allocation settings for scheduler jobs +RequestedCPUs: 16 +RequestedMemory: "20gb" +RequestedTime: "01:00:00" +SchedulerPartition: "devel" + +## Max number of simultaneous Jacobian runs from the job array (-1: no limit) +MaxSimultaneousRuns: -1 + +## Number of Jacobian tracers to use for each jacobian simulation +## Specifying a value = 1 will submit a separate jacobian simulation for each +## state vector element. Specifying a value > 1 will combine state vector +## elements into a single jacobian simulation. +NumJacobianTracers: 10 + +##==================================================================== +## +## Advanced Settings (optional) +## +##==================================================================== + +## These settings are intended for advanced users who wish to: +## a. modify additional GEOS-Chem options, or +## b. run the IMI on a local cluster. +## They can be ignored for any standard cloud application of the IMI. + +##-------------------------------------------------------------------- +## Additional settings for GEOS-Chem simulations +##-------------------------------------------------------------------- + +## Jacobian settings +## Note PerturbValue and PerturbValueOH are relative scale factors and +## PerturbValueBCs is in ppb +PerturbValue: 1.0 +PerturbValueOH: 1.1 +PerturbValueBCs: 10.0 + +# ## Use eigenvector perturbations instead of grid cell perturbations in the +# ## GEOSChem run +# PerturbEigenvectors: true +# nEigenvectors: 263 + +## Save out hourly diagnostics from GEOS-Chem?
+## For use in satellite operators via post-processing -- required for TROPOMI +## inversions +HourlySpecies: true + +## Turn on planeflight diagnostic in GEOS-Chem? +## For use in comparing GEOS-Chem against planeflight data. The path +## to those data must be specified in input.geos. +PLANEFLIGHT: false + +## Turn on old observation operators in GEOS-Chem? +## These will save out text files comparing GEOS-Chem to observations, but have +## to be manually incorporated into the IMI +GOSAT: false +TCCON: false +AIRS: false + +## Use global boundary condition files for initial conditions +UseBCsForRestart: False + +##------------------------------------------------------------------ +## Settings for running on local cluster +##------------------------------------------------------------------ + +## Path for IMI runs and output +OutputPath: "/nobackupp27/$USER/IMI_demo" + +## Path to GEOS-Chem input data +DataPath: "/nobackupp27/$USER/ExtData" + +## Path to satellite data +# DataPathObs: "/nobackup/$USER/CO2_inversion/observations/OCO-2" +DataPathObs: "/nobackupp27/$USER/IMI_demo/data_TROPOMI" + +## GEOS-Chem environment file (with fortran compiler, netcdf libraries, etc.) +## NOTE: Copy your own file in the envs/ directory within the IMI +GEOSChemEnv: "envs/NASA-Pleiades/gcclassic.pleiades.env" + +## Python environment file (this is normally one or two lines) +PythonEnv: "envs/NASA-Pleiades/python.env" + +## Download initial restart file from AWS S3? +## NOTE: Must have AWS CLI enabled +RestartDownload: false + +## Path to initial GEOS-Chem restart file + prefix +## ("YYYYMMDD_0000z.nc4" will be appended) +RestartFilePrefix: "/nobackup/$USER/CO2_inversion/restart_" +RestartFilePreviewPrefix: "/nobackup/$USER/CO2_inversion/restart_" + +## Path to GEOS-Chem boundary condition files (for regional simulations) +## BCversion will be appended to the end of this path. ${BCpath}/${BCversion} +BCpath: "/nobackup/$USER" +BCversion: "v2023-10" + +## Options to download missing GEOS-Chem input data from AWS S3 +## NOTE: Must have AWS CLI enabled +PreviewDryRun: false +SpinupDryrun: false +ProductionDryRun: false +PosteriorDryRun: false +BCdryrun: false diff --git a/envs/NASA-Pleiades/config.nasa-pleiades.yml b/envs/NASA-Pleiades/config.nasa-pleiades.yml new file mode 100644 index 00000000..1b9c08bd --- /dev/null +++ b/envs/NASA-Pleiades/config.nasa-pleiades.yml @@ -0,0 +1,220 @@ +## IMI configuration file +## Documentation @ https://imi.readthedocs.io/en/latest/getting-started/imi-config-file.html + +## General +RunName: "Test_ICI_Global" +Species: "CO2" +isAWS: false +SchedulerType: "PBS" +SafeMode: false +S3Upload: false + +## Period of interest +StartDate: 20221001 +EndDate: 20221003 +SpinupMonths: 1 + +## What satellite data product should be used? Current options are: +## "BlendedTROPOMI" : The dataset generated by Balasus et al. in which +## the TROPOMI data are fit to GOSAT data using ML +## "TROPOMI" : The operational TROPOMI data +## "Other" : Any other dataset +## Currently, only "BlendedTROPOMI" and "TROPOMI" are supported on AWS. If +## "Other" is selected, the user must specify the path where observations are +## located under "Advanced settings" in this file. +SatelliteProduct: "Other" +# BlendedTROPOMI: false + +## Is this a regional inversion? 
Set to false for global inversion +isRegional: false + +## Select two character region ID (for using pre-cropped meteorological fields) +## Current options are listed below with ([lat],[lon]) bounds: +## "AF" : Africa ([-37,40], [-20,53]) +## "AS" : Asia ([-11,55],[60,150]) +## "EU" : Europe ([33,61],[-30,70]) +## "ME" : Middle East ([12,50], [-20,70]) +## "NA" : North America ([10,70],[-140,-40]) +## "OC" : Oceania ([-50,5], [110,180]) +## "RU" : Russia ([41,83], [19,180]) +## "SA" : South America ([-59,16], [-88,-31]) +## "" : Use for a global simulation or custom regions +## For example, if the region of interest is in Europe ([33,61],[-30,70]), select "EU". +RegionID: "" + +## Region of interest +## These lat/lon bounds are only used if CreateAutomaticRectilinearStateVectorFile: true +## Otherwise lat/lon bounds are determined from StateVectorFile +LonMin: -102.5 +LonMax: -87.5 +LatMin: 16 +LatMax: 24 + +## Kalman filter options +KalmanMode: false +UpdateFreqDays: 7 +NudgeFactor: 0.1 + +## State vector +CreateAutomaticRectilinearStateVectorFile: true +nBufferClusters: 0 +BufferDeg: 0 +OptimizeBCs: false +LandThreshold: 0.25 +OffshoreEmisThreshold: 0 +OptimizeOH: false + +## Point source datasets +## Used for visualizations and state vector clustering +PointSourceDatasets: ["SRON"] + +## Clustering Options +ReducedDimensionStateVector: false +DynamicKFClustering: false +ClusteringMethod: "kmeans" +NumberOfElements: 45 +ForcedNativeResolutionElements: + - [31.5, -104] + +## Custom state vector +StateVectorFile: "/path/to/StateVector.nc" +ShapeFile: "None" + +## Inversion +## Note PriorError and PriorErrorOH are relative fractions (e.g. 0.5 = 50%) +## and PriorErrorBCs is in ppb +PriorError: 0.5 +PriorErrorBCs: 10.0 +PriorErrorOH: 0.5 +ObsError: 15 +Gamma: 1.0 +PrecomputedJacobian: false + +## Grid +## Options are 0.25x0.3125 (GEOSFP only), 0.5x0.625, 2.0x2.5, or 4.0x5.0 +Res: "2.0x2.5" + +## Meteorology +## Options are GEOSFP or MERRA2 +Met: "MERRA2" + +## Setup modules +## Turn on/off different steps in setting up the inversion +SetupTemplateRundir: true +SetupSpinupRun: true +SetupJacobianRuns: true +SetupInversion: false +SetupPosteriorRun: false + +## Run modules +## Turn on/off different steps in performing the inversion +RunSetup: false +DoSpinup: true +DoJacobian: true +DoInversion: false +DoPosterior: false + +## IMI preview +## NOTE: RunSetup must be true to run preview +DoPreview: true +DOFSThreshold: 0 + +## Resource allocation settings for scheduler jobs +SimulationCPUs: 32 +SimulationMemory: "32gb" +JacobianCPUs: 1 +JacobianMemory: "2gb" +RequestedTime: "01:00:00" +SchedulerPartition: "debug" + +## Max number of simultaneous Jacobian runs from the job array (-1: no limit) +MaxSimultaneousRuns: 50 + +##==================================================================== +## +## Advanced Settings (optional) +## +##==================================================================== + +## These settings are intended for advanced users who wish to: +## a. modify additional GEOS-Chem options, or +## b. run the IMI on a local cluster. +## They can be ignored for any standard cloud application of the IMI.
+ +##-------------------------------------------------------------------- +## Additional settings for GEOS-Chem simulations +##-------------------------------------------------------------------- + +## Jacobian settings +## Note PerturbValue and PerturbValueOH are relative scale factors and +## PerturbValueBCs is in ppb +PerturbValue: 1.5 +PerturbValueOH: 1.5 +PerturbValueBCs: 10.0 + +## Apply scale factors from a previous inversion? +UseEmisSF: false +UseOHSF: false + +## Save out hourly diagnostics from GEOS-Chem? +## For use in satellite operators via post-processing -- required for TROPOMI +## inversions +HourlySpecies: true + +## Turn on planeflight diagnostic in GEOS-Chem? +## For use in comparing GEOS-Chem against planeflight data. The path +## to those data must be specified in input.geos. +PLANEFLIGHT: false + +## Turn on old observation operators in GEOS-Chem? +## These will save out text files comparing GEOS-Chem to observations, but have +## to be manually incorporated into the IMI +GOSAT: false +TCCON: false +AIRS: false + +## Use global boundary condition files for initial conditions +UseBCsForRestart: False + +##------------------------------------------------------------------ +## Settings for running on local cluster +##------------------------------------------------------------------ + +## Path for IMI runs and output +OutputPath: "/nobackupp27/$USER/IMI_demo" + +## Path to GEOS-Chem input data +DataPath: "/nobackupp27/$USER/ExtData" + +## Path to satellite data +# DataPathObs: "/nobackup/$USER/CO2_inversion/observations/OCO-2" +DataPathObs: "/nobackupp27/$USER/IMI_demo/data_TROPOMI" + +## GEOS-Chem environment file (with fortran compiler, netcdf libraries, etc.) +## NOTE: Copy your own file in the envs/ directory within the IMI +GEOSChemEnv: "envs/NASA-Pleiades/gcclassic.pleiades.env" + +## Python environment file (this is normally one or two lines) +PythonEnv: "envs/NASA-Pleiades/python.env" + +## Download initial restart file from AWS S3? +## NOTE: Must have AWS CLI enabled +RestartDownload: false + +## Path to initial GEOS-Chem restart file + prefix +## ("YYYYMMDD_0000z.nc4" will be appended) +RestartFilePrefix: "/nobackup/$USER/CO2_inversion/restart_" +RestartFilePreviewPrefix: "/nobackup/$USER/CO2_inversion/restart_" + +## Path to GEOS-Chem boundary condition files (for regional simulations) +## BCversion will be appended to the end of this path. ${BCpath}/${BCversion} +BCpath: "/nobackup/$USER" +BCversion: "v2023-10" + +## Options to download missing GEOS-Chem input data from AWS S3 +## NOTE: Must have AWS CLI enabled +PreviewDryRun: false +SpinupDryrun: false +ProductionDryRun: false +PosteriorDryRun: false +BCdryrun: false diff --git a/envs/NASA-Pleiades/gcclassic.pleiades.env b/envs/NASA-Pleiades/gcclassic.pleiades.env new file mode 100644 index 00000000..6b1758e1 --- /dev/null +++ b/envs/NASA-Pleiades/gcclassic.pleiades.env @@ -0,0 +1,70 @@ + +#============================================================================== +# gcclassic.pleiades.env +# Environment file for GCClassic on Pleiades +# +# Compilers: +# Intel or GNU. Available versions can be found by typing "module avail" +# All theoretically available packages can be found here: +# https://www.nas.nasa.gov/hecc/support/kb/software-on-nas-systems_116.html +# +# Additional software: +# git Present always. This can be checked with "git version" +# CMake Present always. Version 3.13 or later is needed.
This can +# be checked with "cmake --version" +#============================================================================== + +# Display message (if we are in a terminal window) +if [[ $- = *i* ]] ; then + echo "Loading modules for GEOS-Chem Classic." +fi + +# Unload packages loaded previously using "module load" +module purge + +# Load Intel compilers +module load comp-intel/2019.5.281 + +# netCDF-Fortran +module load szip/2.1.1 +module load mpi-hpe/mpt +module load hdf4/4.2.12 +module load hdf5/1.8.18_mpt +module load netcdf/4.4.1.1_mpt + +# Load python for postprocessing +# Right now, this has most of the modules I need. +# Eventually, I'll make my own environment. +# module load python3/3.9.12 + +# And load node_stats.sh. +module load scicon/cli_tools + +# # Load mpi-intel +# module use -a /nasa/modulefiles/testing +# module load mpi-intel/2019.5.281 + +#============================================================================== +# Environment variables +#============================================================================== +# Parallelization +export OMP_NUM_THREADS=8 +export OMP_STACKSIZE=500m + +# Make all files world-readable by default +umask 022 + +# Specify compilers +export CC=icc +export CXX=icpc +export FC=ifort + +# # Netcdf variables for CMake +# # NETCDF_HOME and NETCDF_FORTRAN_HOME are automatically +# # defined by the "module load" commands on Cannon. +# export NETCDF_C_ROOT=${NETCDF_HOME} +# export NETCDF_FORTRAN_ROOT=${NETCDF_FORTRAN_HOME} + +# List modules loaded +module list + diff --git a/envs/NASA-Pleiades/pip_requirements.txt b/envs/NASA-Pleiades/pip_requirements.txt new file mode 100644 index 00000000..78a2e954 --- /dev/null +++ b/envs/NASA-Pleiades/pip_requirements.txt @@ -0,0 +1,30 @@ +Cartopy==0.22.0 +cftime==1.6.2 +contourpy==1.1.0 +dask==2023.9.1 +debugpy==1.8.0 +decorator==5.1.1 +geopandas==0.14.1 +geopy==2.4.0 +h5netcdf==1.3.0 +h5py==3.9.0 +ipython==8.15.0 +jupyterlab==4.0.5 +mat73==0.62 +matplotlib==3.7.2 +netCDF4==1.6.4 +numpy==1.24.4 +pandas==2.1.0 +pickleshare==0.7.5 +Pillow==10.0.0 +pip==23.2.1 +pyproj==3.6.0 +pyshp==2.3.1 +pytest==7.4.2 +PyYAML==6.0.1 +scikit-learn==1.3.2 +scipy==1.11.2 +shapely==2.0.1 +sparse==0.14.0 +xarray==2023.8.0 +xesmf==0.7.1 \ No newline at end of file diff --git a/envs/NASA-Pleiades/python.env b/envs/NASA-Pleiades/python.env new file mode 100644 index 00000000..ca4df0de --- /dev/null +++ b/envs/NASA-Pleiades/python.env @@ -0,0 +1,5 @@ +#============================================================================== +# Load Python environment +#============================================================================== +printf "\nActivating Python environment: ${HOME}/CO2_inversion/.venv/bin/activate\n" +source ${HOME}/CO2_inversion/.venv/bin/activate \ No newline at end of file diff --git a/envs/aws/python.env b/envs/aws/python.env new file mode 100644 index 00000000..25ed2f3d --- /dev/null +++ b/envs/aws/python.env @@ -0,0 +1,6 @@ +#============================================================================== +# Load Python environment +#============================================================================== +printf "\nActivating conda environment: geo\n" +source /home/ubuntu/miniconda/etc/profile.d/conda.sh +conda activate geo \ No newline at end of file diff --git a/resources/containers/al2/container_config.yml b/resources/containers/al2/container_config.yml index 857bc4d1..17d5aa19 100644 --- a/resources/containers/al2/container_config.yml +++ b/resources/containers/al2/container_config.yml
@@ -3,8 +3,9 @@ ## General RunName: "Test_Permian_1week" +Species: "CH4" isAWS: true -UseSlurm: true +SchedulerType: "slurm" SafeMode: true S3Upload: false @@ -13,8 +14,16 @@ StartDate: 20180501 EndDate: 20180508 SpinupMonths: 1 -## Use blended TROPOMI+GOSAT data (true)? Or use operational TROPOMI data (false)? -BlendedTROPOMI: false +## What satellite data product should be used? Current options are: +## "BlendedTROPOMI" : The dataset generated by Balasus et al. in which +## the TROPOMI data are fit to GOSAT data using ML +## "TROPOMI" : The operational TROPOMI data +## "Other" : Any other dataset +## Currently, only "BlendedTROPOMI" and "TROPOMI" are supported on AWS. If +## "Other" is selected, the user must specify the path where observations are +## located under "Advanced settings" in this file. +SatelliteProduct: "Other" +# BlendedTROPOMI: false ## Use observations over water? Set to false to filter out water observations UseWaterObs: false @@ -128,10 +137,10 @@ DOFSThreshold: 0 ## Resource allocation settings for slurm jobs RequestedCPUs: 8 -RequestedMemory: 8000 +RequestedMemory: "8gb" InversionCPUs: 16 -InversionMemory: 16000 -RequestedTime: "0-24:00" +InversionMemory: "16gb" +RequestedTime: "24:00:00" SchedulerPartition: "debug" ## Max number of simultaneous Jacobian runs from the job array (-1: no limit) @@ -169,7 +178,7 @@ PerturbValueBCs: 10.0 ## Save out hourly diagnostics from GEOS-Chem? ## For use in satellite operators via post-processing -- required for TROPOMI ## inversions -HourlyCH4: true +HourlySpecies: true ## Turn on planeflight diagnostic in GEOS-Chem? ## For use in comparing GEOS-Chem against planeflight data. The path @@ -183,6 +192,9 @@ GOSAT: false TCCON: false AIRS: false +## Use global boundary condition files for initial conditions +UseBCsForRestart: False + ##------------------------------------------------------------------ ## Settings for running on local cluster ##------------------------------------------------------------------ @@ -193,9 +205,8 @@ OutputPath: "/home/al2/imi_output_dir" ## Path to GEOS-Chem input data DataPath: "/home/al2/ExtData" -## Conda environment files -CondaFile: "/etc/bashrc" -CondaEnv: "imi_env" +## Python environment file +PythonEnv: "/home/ubuntu/integrated_methane_inversion/envs/aws/python.env" ## Download initial restart file from AWS S3? ## NOTE: Must have AWS CLI enabled diff --git a/resources/containers/ubuntu/base-image/install-scripts/slurm/test_slurm.sh b/resources/containers/ubuntu/base-image/install-scripts/slurm/test_slurm.sh index 8dc172fa..36a73923 100644 --- a/resources/containers/ubuntu/base-image/install-scripts/slurm/test_slurm.sh +++ b/resources/containers/ubuntu/base-image/install-scripts/slurm/test_slurm.sh @@ -1,11 +1,11 @@ #!/bin/bash -#SBATCH --job-name=test_job -#SBATCH --output=test_job.out -#SBATCH --partition=debug -#SBATCH --nodes=1 -#SBATCH --mem=100 -#SBATCH --ntasks-per-node=1 -#SBATCH --time=00:05:00 +#SBATCH -J test_job +#SBATCH -o test_job.out +#SBATCH -p debug +#SBATCH -N 1 +#SBATCH --mem 100 +#SBATCH --ntasks-per-node 1 +#SBATCH -t 00:05:00 echo "Hello from Slurm job!" sleep 3 diff --git a/run_imi.sh b/run_imi.sh index 1118e397..44d15f13 100755 --- a/run_imi.sh +++ b/run_imi.sh @@ -1,9 +1,7 @@ #!/bin/bash -#SBATCH -N 1 -#SBATCH -c 1 -#SBATCH --mem=2000 -#SBATCH -o "imi_output.log" +#PBS -l nodes=1,ncpus=1 +#PBS -o "imi_output.log" # This script will run the Integrated Methane Inversion (IMI) with GEOS-Chem. # For documentation, see https://imi.readthedocs.io.
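For reference while reading the rest of this patch: the helpers submit_job and convert_sbatch_to_pbs called below are defined in the IMI utility scripts and are not part of this excerpt. Inferring only from the call sites (submit_job $SchedulerType <capture-output?> <memory> <cpus> <time> <script> [args...]), a minimal sketch of what such a wrapper might look like is given here; the argument order is taken from the call sites, but the qsub flags are assumed PBS Pro equivalents of the sbatch flags being replaced, not the actual IMI implementation.

# Hypothetical sketch only -- the real submit_job lives in the IMI utilities.
submit_job() {
    local scheduler_type=$1  # "slurm" or "PBS"
    local capture_output=$2  # true: append the job output to imi_output.log
    local memory=$3 cpus=$4 wall_time=$5
    shift 5                  # remaining arguments: script plus its arguments
    local out=()
    if "$capture_output"; then out=(-o imi_output.tmp); fi

    if [[ "$scheduler_type" = "slurm" ]]; then
        # -W blocks until the job finishes, matching the old "sbatch ... -W; wait"
        sbatch --mem "$memory" -c "$cpus" -t "$wall_time" \
            -p "$SchedulerPartition" "${out[@]}" -W "$@"
        wait
    elif [[ "$scheduler_type" = "PBS" ]]; then
        # -W block=true is the PBS Pro analogue of sbatch -W (wait for completion);
        # memory strings such as "10gb" are passed through unchanged
        qsub -l "select=1:ncpus=${cpus}:mem=${memory}" -l "walltime=${wall_time}" \
            -q "$SchedulerPartition" "${out[@]}" -W block=true -- "$@"
    fi

    if "$capture_output"; then
        cat imi_output.tmp >>"${InversionPath}/imi_output.log"
        rm -f imi_output.tmp
    fi
}

Under the same reading, convert_sbatch_to_pbs would rewrite the #SBATCH directives in the generated run scripts into #PBS directives before submission; treat both sketches as assumptions until checked against src/utilities in the repository.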
@@ -47,23 +45,14 @@ fi # Get the conda environment name and source file # These variables are sourced manually because # we need the python environment to parse the yaml file -CondaEnv=$(grep '^CondaEnv:' ${ConfigFile} | - sed 's/CondaEnv://' | +PythonEnv=$(grep '^PythonEnv:' ${ConfigFile} | + sed 's/PythonEnv://' | sed 's/#.*//' | sed 's/^[[:space:]]*//' | tr -d '"') -CondaFile=$(eval echo $(grep '^CondaFile:' ${ConfigFile} | - sed 's/CondaFile://' | - sed 's/#.*//' | - sed 's/^[[:space:]]*//' | - tr -d '"')) # Load conda/mamba/micromamba e.g. ~/.bashrc -source $CondaFile - -# Activate Conda environment -printf "\nActivating conda environment: ${CondaEnv}\n" -conda activate ${CondaEnv} +source $PythonEnv # Parsing the config file eval $(python src/utilities/parse_yaml.py ${ConfigFile}) @@ -76,7 +65,12 @@ if ! "$isAWS"; then exit 1 else printf "\nLoading GEOS-Chem environment: ${GEOSChemEnv}\n" - source ${GEOSChemEnv} + source ${GEOSChemEnv} + fi + + # If scheduler is PBS, convert the sbatch directives in the run scripts to PBS + if [[ "$SchedulerType" = "PBS" ]]; then + convert_sbatch_to_pbs fi fi @@ -150,36 +144,31 @@ echo "# TROPOMI/blended processor version(s): ${TROPOMI_PROCESSOR_VERSION}" >>"$ ##======================================================================= ## Download the TROPOMI data ##======================================================================= - # Download TROPOMI or blended dataset from AWS -tropomiCache=${RunDirs}/satellite_data +mkdir -p -v ${RunDirs} +satelliteCache=${RunDirs}/satellite_data if "$isAWS"; then - mkdir -p -v $tropomiCache + mkdir -p -v $satelliteCache - if "$BlendedTROPOMI"; then + if [[ "$SatelliteProduct" == "BlendedTROPOMI" ]]; then downloadScript=src/utilities/download_blended_TROPOMI.py - else + elif [[ "$SatelliteProduct" == "TROPOMI" ]]; then downloadScript=src/utilities/download_TROPOMI.py + else + printf "\nERROR: $SatelliteProduct is not currently supported for automatic download\n" + exit 1 fi - sbatch --mem $RequestedMemory \ - -c $RequestedCPUs \ - -t $RequestedTime \ - -p $SchedulerPartition \ - -o imi_output.tmp \ - -W $downloadScript $StartDate $EndDate $tropomiCache - wait - cat imi_output.tmp >>${InversionPath}/imi_output.log - rm imi_output.tmp + + submit_job $SchedulerType true $RequestedMemory $RequestedCPUs $RequestedTime $downloadScript $StartDate $EndDate $satelliteCache else # use existing tropomi data and create a symlink to it - if [[ ! -L $tropomiCache ]]; then - ln -s $DataPathTROPOMI $tropomiCache + if [[ !
-L $satelliteCache ]]; then + ln -s $DataPathObs $satelliteCache fi fi # Check to make sure there are no duplicate TROPOMI files (e.g., two files with the same orbit number but a different processor version) -python src/utilities/test_TROPOMI_dir.py $tropomiCache +python src/utilities/test_TROPOMI_dir.py $satelliteCache ##======================================================================= ## Run the setup script diff --git a/src/components/inversion_component/inversion.sh b/src/components/inversion_component/inversion.sh index a6ac7362..28447da4 100644 --- a/src/components/inversion_component/inversion.sh +++ b/src/components/inversion_component/inversion.sh @@ -85,13 +85,7 @@ run_inversion() { InvCPU="${InversionCPUs:-$RequestedCPUs}" InvTime="${InversionTime:-$RequestedTime}" # Execute inversion driver script - sbatch --mem $InvMem \ - -c $InvCPU \ - -t $InvTime \ - -p $SchedulerPartition \ - -W run_inversion.sh $FirstSimSwitch - wait - + submit_job $SchedulerType false $InvMem $InvCPU $InvTime run_inversion.sh $FirstSimSwitch # check if exited with non-zero exit code [ ! -f ".error_status_file.txt" ] || imi_failed $LINENO diff --git a/src/components/jacobian_component/jacobian.sh b/src/components/jacobian_component/jacobian.sh index ab72c0da..c7ca7e63 100644 --- a/src/components/jacobian_component/jacobian.sh +++ b/src/components/jacobian_component/jacobian.sh @@ -20,10 +20,12 @@ setup_jacobian() { cd ${RunDirs} # make dir for jacobian ics/bcs - mkdir -p jacobian_1ppb_ics_bcs/Restarts - mkdir -p jacobian_1ppb_ics_bcs/BCs - OrigBCFile=${fullBCpath}/GEOSChem.BoundaryConditions.${StartDate}_0000z.nc4 - python ${InversionPath}/src/components/jacobian_component/make_jacobian_icbc.py $OrigBCFile ${RunDirs}/jacobian_1ppb_ics_bcs/BCs $StartDate + mkdir -p jacobian_lowbg_ics_bcs/Restarts + if $isRegional; then + mkdir -p jacobian_lowbg_ics_bcs/BCs + OrigBCFile=${fullBCpath}/GEOSChem.BoundaryConditions.${StartDate}_0000z.nc4 + python ${InversionPath}/src/components/jacobian_component/make_jacobian_icbc.py $OrigBCFile ${RunDirs}/jacobian_lowbg_ics_bcs/BCs $StartDate $Species + fi # Create directory that will contain all Jacobian run directories mkdir -p -v jacobian_runs @@ -161,7 +163,7 @@ create_simulation_dir() { # Update settings in HISTORY.rc # Only save out hourly pressure fields to daily files for base run if [[ $x -eq 0 ]] || [[ "$x" = "background" ]]; then - if "$HourlyCH4"; then + if "$HourlySpecies"; then sed -i -e 's/'\''Restart/#'\''Restart/g' \ -e 's/#'\''LevelEdgeDiags/'\''LevelEdgeDiags/g' \ -e 's/LevelEdgeDiags.frequency: 00000100 000000/LevelEdgeDiags.frequency: 00000000 010000/g' \ @@ -170,7 +172,7 @@ create_simulation_dir() { fi # For all other runs, just disable Restarts else - if "$HourlyCH4"; then + if "$HourlySpecies"; then sed -i -e 's/'\''Restart/#'\''Restart/g' HISTORY.rc fi fi @@ -183,8 +185,8 @@ create_simulation_dir() { fi # Create run script from template - sed -e "s:namename:${name}:g" ch4_run.template >${name}.run - rm -f ch4_run.template + sed -e "s:namename:${name}:g" run.template >${name}.run + rm -f run.template chmod 755 ${name}.run ### Turn on observation operators if requested, only for base run @@ -275,13 +277,13 @@ create_simulation_dir() { fi else - # set 1ppb CH4 boundary conditions and restarts for all other perturbation simulations + # set lowbg boundary conditions and restarts for all other perturbation simulations # Note that we use the timecycle flag C to avoid having to make additional files - 
RestartFile=${RunDirs}/jacobian_1ppb_ics_bcs/Restarts/GEOSChem.Restart.1ppb.${StartDate}_0000z.nc4 - BCFile1ppb=${RunDirs}/jacobian_1ppb_ics_bcs/BCs/GEOSChem.BoundaryConditions.1ppb.${StartDate}_0000z.nc4 - BCSettings1ppb="SpeciesBC_CH4 1980-2021/1-12/1-31/* C xyz 1 CH4 - 1 1" - sed -i -e "s|.*GEOSChem\.BoundaryConditions.*|\* BC_CH4 ${BCFile1ppb} ${BCSettings1ppb}|g" HEMCO_Config.rc - # create symlink to 1ppb restart file + RestartFile=${RunDirs}/jacobian_lowbg_ics_bcs/Restarts/GEOSChem.Restart.lowbg.${StartDate}_0000z.nc4 + BCFilelowbg=${RunDirs}/jacobian_lowbg_ics_bcs/BCs/GEOSChem.BoundaryConditions.lowbg.${StartDate}_0000z.nc4 + BCSettingslowbg="SpeciesBC_CH4 1980-2021/1-12/1-31/* C xyz 1 CH4 - 1 1" + sed -i -e "s|.*GEOSChem\.BoundaryConditions.*|\* BC_CH4 ${BCFilelowbg} ${BCSettingslowbg}|g" HEMCO_Config.rc + # create symlink to lowbg restart file ln -sf $RestartFile Restarts/GEOSChem.Restart.${StartDate}_0000z.nc4 # Also, set emissions to zero for default CH4 tracer by applying ZERO scale factor (id 5) sed -i -e "s|CH4 - 1 500|CH4 5 1 500|g" HEMCO_Config.rc @@ -340,25 +342,32 @@ add_new_tracer() { # Add lines to geoschem_config.yml # Spacing in GcNewLine is intentional GcNewLine='\ - - CH4_'$istr + - '${Species}'_'$istr sed -i -e "/$GcPrevLine/a $GcNewLine" geoschem_config.yml - GcPrevLine='- CH4_'$istr + GcPrevLine='- '${Species}'_'$istr # Add lines to species_database.yml SpcNextLine='CHBr3:' - SpcNewLines='CH4_'$istr':\n << : *CH4properties\n Background_VV: 1.8e-6\n FullName: Methane' + if [[ $Species = "CH4" ]]; then + bg_vv="1.8e-6" + fullname="Methane" + elif [[ $Species = "CO2" ]]; then + bg_vv="4.0e-6" + fullname="Carbon dioxide" + fi + SpcNewLines=${Species}'_'$istr':\n << : *'${Species}'properties\n Background_VV: '${bg_vv}'\n FullName: '${fullname} sed -i -e "s|$SpcNextLine|$SpcNewLines\n$SpcNextLine|g" species_database.yml # Add lines to HEMCO_Config.yml HcoNewLine1='\ -* SPC_CH4_'$istr' - - - - - - CH4_'$istr' - 1 1' +* SPC_'${Species}'_'$istr' - - - - - - '${Species}'_'$istr' - 1 1' sed -i -e "/$HcoPrevLine1/a $HcoNewLine1" HEMCO_Config.rc - HcoPrevLine1='SPC_CH4_'$istr + HcoPrevLine1='SPC_'${Species}'_'$istr HcoNewLine2='\ -0 CH4_Emis_Prior_'$istr' - - - - - - CH4_'$istr' '$SFnum' 1 500' +0 '${Species}'_Emis_Prior_'$istr' - - - - - - '${Species}'_'$istr' '$SFnum' 1 500' sed -i "/$HcoPrevLine2/a $HcoNewLine2" HEMCO_Config.rc - HcoPrevLine2='CH4_'$istr' '$SFnum' 1 500' + HcoPrevLine2=${Species}'_'$istr' '$SFnum' 1 500' HcoNewLine3='\ '$SFnum' SCALE_ELEM_'$istr' Perturbations_'$istr'.txt - - - xy count 1' @@ -366,9 +375,9 @@ HcoPrevLine3='SCALE_ELEM_'$istr' Perturbations_'$istr'.txt - - - xy count 1' HcoNewLine4='\ -* BC_CH4_'$istr' - - - - - - CH4_'$istr' - 1 1' +* BC_'${Species}'_'$istr' - - - - - - '${Species}'_'$istr' - 1 1' sed -i -e "/$HcoPrevLine4/a $HcoNewLine4" HEMCO_Config.rc - HcoPrevLine4='BC_CH4_'$istr + HcoPrevLine4='BC_'${Species}'_'$istr # Add new Perturbations.txt and update for non prior runs cp Perturbations.txt Perturbations_${istr}.txt @@ -415,12 +424,12 @@ run_jacobian() { cd ${RunDirs}/jacobian_runs jacobian_start=$(date +%s) - # create 1ppb restart file + # create lowbg restart file OrigRestartFile=$(readlink ${RunName}_0000/Restarts/GEOSChem.Restart.${StartDate}_0000z.nc4) - python ${InversionPath}/src/components/jacobian_component/make_jacobian_icbc.py $OrigRestartFile ${RunDirs}/jacobian_1ppb_ics_bcs/Restarts $StartDate - cd ${RunDirs}/jacobian_1ppb_ics_bcs/Restarts/ - if [ -f GEOSChem.BoundaryConditions.1ppb.${StartDate}_0000z.nc4 ]; then - mv
GEOSChem.BoundaryConditions.1ppb.${StartDate}_0000z.nc4 GEOSChem.Restart.1ppb.${StartDate}_0000z.nc4 + python ${InversionPath}/src/components/jacobian_component/make_jacobian_icbc.py $OrigRestartFile ${RunDirs}/jacobian_lowbg_ics_bcs/Restarts $StartDate $Species + cd ${RunDirs}/jacobian_lowbg_ics_bcs/Restarts/ + if [ -f GEOSChem.BoundaryConditions.lowbg.${StartDate}_0000z.nc4 ]; then + mv GEOSChem.BoundaryConditions.lowbg.${StartDate}_0000z.nc4 GEOSChem.Restart.lowbg.${StartDate}_0000z.nc4 fi cd ${RunDirs}/jacobian_runs set +e @@ -430,11 +439,7 @@ source submit_jacobian_simulations_array.sh if "$LognormalErrors"; then - sbatch --mem $RequestedMemory \ - -c $RequestedCPUs \ - -t $RequestedTime \ - -p $SchedulerPartition \ - -W run_bkgd_simulation.sh + submit_job $SchedulerType false $RequestedMemory $RequestedCPUs $RequestedTime run_bkgd_simulation.sh wait fi @@ -463,28 +468,14 @@ # Submit prior simulation to job scheduler printf "\n=== SUBMITTING PRIOR SIMULATION ===\n" - sbatch --mem $RequestedMemory \ - -c $RequestedCPUs \ - -t $RequestedTime \ - -p $SchedulerPartition \ - -W run_prior_simulation.sh - wait - cat imi_output.tmp >>${InversionPath}/imi_output.log - rm imi_output.tmp - # check if prior simulation exited with non-zero exit code + submit_job $SchedulerType true $RequestedMemory $RequestedCPUs $RequestedTime run_prior_simulation.sh [ ! -f ".error_status_file.txt" ] || imi_failed $LINENO - printf "=== DONE PRIOR SIMULATION ===\n" # Run the background simulation if lognormal errors enabled if "$LognormalErrors"; then printf "\n=== SUBMITTING BACKGROUND SIMULATION ===\n" - sbatch --mem $RequestedMemory \ - -c $RequestedCPUs \ - -t $RequestedTime \ - -p $SchedulerPartition \ - -W run_bkgd_simulation.sh - wait + submit_job $SchedulerType false $RequestedMemory $RequestedCPUs $RequestedTime run_bkgd_simulation.sh # check if background simulation exited with non-zero exit code [ ! -f ".error_status_file.txt" ] || imi_failed $LINENO printf "=== DONE BACKGROUND SIMULATION ===\n" diff --git a/src/components/jacobian_component/make_jacobian_icbc.py b/src/components/jacobian_component/make_jacobian_icbc.py index e2770dca..f0cf746a 100644 --- a/src/components/jacobian_component/make_jacobian_icbc.py +++ b/src/components/jacobian_component/make_jacobian_icbc.py @@ -2,6 +2,9 @@ import sys import glob import xarray as xr +from src.inversion_scripts.utils import ( + mixing_ratio_conv_factor, +) def check_path_and_get_file(path, pattern="*"): @@ -27,12 +30,11 @@ else: raise FileNotFoundError(f"The path '{path}' is neither a file nor a directory.") - -def make_jacobian_icbc(original_file_path, new_file_path, file_date): +def make_jacobian_icbc(original_file_path, new_file_path, file_date, species): """ - This function takes a restart or boundary condition file and - sets the CH4 concentration to 1 ppb for use in the Jacobian - simulations. + This function takes a restart or boundary condition file and + sets the species concentration to 1 mixing ratio unit for use in the + Jacobian simulations. Arguments original_file_path [str] : original restart/bc file path new_file_path [str] : new restart/bc file path @@ -46,19 +48,19 @@ # determine which data variable to change data_vars = list(orig.data_vars) - if "SpeciesBC_CH4" in data_vars: - key = "SpeciesBC_CH4" - file_prefix = "GEOSChem.BoundaryConditions.1ppb."
- elif "SpeciesRst_CH4" in data_vars: - key = "SpeciesRst_CH4" - file_prefix = "GEOSChem.Restart.1ppb." + if f"SpeciesBC_{species}" in data_vars: + key = f"SpeciesBC_{species}" + file_prefix = "GEOSChem.BoundaryConditions.lowbg." + elif f"SpeciesRst_{species}" in data_vars: + key = f"SpeciesRst_{species}" + file_prefix = f"GEOSChem.Restart.lowbg." else: - raise ValueError("No recognized CH4 species found in the file.") - - # set all values to 1 ppb + raise ValueError(f"No recognized {species} species found in the file.") + + # set all values to 1 mixing ratio unit, depending on the species new_restart[key] *= 0.0 - new_restart[key] += 1e-9 - + new_restart[key] += 1/mixing_ratio_conv_factor(species) + write_path = os.path.join(new_file_path, f"{file_prefix}{file_date}_0000z.nc4") # write to new file path diff --git a/src/components/kalman_component/kalman.sh b/src/components/kalman_component/kalman.sh index e924f854..776d9319 100644 --- a/src/components/kalman_component/kalman.sh +++ b/src/components/kalman_component/kalman.sh @@ -121,8 +121,7 @@ run_period() { # Prepare initial (prior) emission scale factors for the current period echo "python path = $PYTHONPATH" - python ${InversionPath}/src/components/kalman_component/prepare_sf.py $ConfigPath $period_i ${RunDirs} $NudgeFactor - wait + python ${InversionPath}/src/components/kalman_component/prepare_sf.py $ConfigPath $period_i ${RunDirs} $NudgeFactor $Species; wait # Dynamically generate state vector for each period if ("$ReducedDimensionStateVector" && "$DynamicKFClustering"); then diff --git a/src/components/kalman_component/prepare_sf.py b/src/components/kalman_component/prepare_sf.py index 68201ed0..81d7419f 100644 --- a/src/components/kalman_component/prepare_sf.py +++ b/src/components/kalman_component/prepare_sf.py @@ -8,7 +8,7 @@ from src.inversion_scripts.utils import get_period_mean_emissions -def prepare_sf(config_path, period_number, base_directory, nudge_factor): +def prepare_sf(config_path, period_number, base_directory, nudge_factor, species): """ Function to prepare scale factors for HEMCO emissions. 
@@ -68,7 +68,10 @@ def prepare_sf(config_path, period_number, base_directory, nudge_factor): original_emis_ds = get_period_mean_emissions( original_prior_cache, p, periods_csv_path ) - original_emis = original_emis_ds["EmisCH4_Total_ExclSoilAbs"] + if species == "CH4": + original_emis = original_emis_ds["EmisCH4_Total_ExclSoilAbs"] + else: + original_emis = original_emis_ds[f"Emis{species}_Total"] # Get the gridded posterior for period p gridded_posterior_filename = ( @@ -126,7 +129,7 @@ def prepare_sf(config_path, period_number, base_directory, nudge_factor): ) # Print the current total emissions in the region of interest - emis = get_posterior_emissions(original_emis_ds, sf)["EmisCH4_Total"] + emis = get_posterior_emissions(original_emis_ds, sf, species)[f"Emis{species}_Total"] total_emis = sum_total_emissions(emis, areas, mask) print(f"Total prior emission = {total_emis} Tg a-1") @@ -157,5 +160,6 @@ period_number = sys.argv[2] base_directory = sys.argv[3] nudge_factor = sys.argv[4] + species = sys.argv[5] - prepare_sf(config_path, period_number, base_directory, nudge_factor) + prepare_sf(config_path, period_number, base_directory, nudge_factor, species) diff --git a/src/components/kalman_component/print_posterior_emissions.py b/src/components/kalman_component/print_posterior_emissions.py index 05465ca7..6e444d96 100644 --- a/src/components/kalman_component/print_posterior_emissions.py +++ b/src/components/kalman_component/print_posterior_emissions.py @@ -41,11 +41,14 @@ def print_posterior_emissions(config_path, period_number, base_directory): # Emissions hemco_emis = hemco_diags posterior_sf = xr.load_dataset(post_sf_path) - posterior_emis_ds = get_posterior_emissions(hemco_emis, posterior_sf) + species = config["Species"] + posterior_emis_ds = get_posterior_emissions( + hemco_emis, posterior_sf, species + ) if "time" in posterior_emis_ds.dims: - posterior_emis = posterior_emis_ds["EmisCH4_Total"].isel(time=0, drop=True) + posterior_emis = posterior_emis_ds[f"Emis{species}_Total"].isel(time=0, drop=True) else: - posterior_emis = posterior_emis_ds["EmisCH4_Total"].squeeze(drop=True) + posterior_emis = posterior_emis_ds[f"Emis{species}_Total"].squeeze(drop=True) total_emis = sum_total_emissions(posterior_emis, areas, mask) # Print diff --git a/src/components/posterior_component/posterior.sh b/src/components/posterior_component/posterior.sh index 7df5f9b4..0fd01e88 100644 --- a/src/components/posterior_component/posterior.sh +++ b/src/components/posterior_component/posterior.sh @@ -66,7 +66,7 @@ setup_posterior() { # Turn on LevelEdgeDiags output # Output daily restarts to avoid trouble at month boundaries - if "$HourlyCH4"; then + if "$HourlySpecies"; then sed -i -e 's/#'\''LevelEdgeDiags/'\''LevelEdgeDiags/g' \ -e 's/LevelEdgeDiags.frequency: 00000100 000000/LevelEdgeDiags.frequency: 00000000 010000/g' \ -e 's/LevelEdgeDiags.duration: 00000100 000000/LevelEdgeDiags.duration: 00000001 000000/g' \ @@ -80,9 +80,9 @@ setup_posterior() { # Create run script from template sed -e "s:namename:${PosteriorName}:g" \ - -e "s:##:#:g" ch4_run.template >${PosteriorName}.run + -e "s:##:#:g" run.template > ${PosteriorName}.run chmod 755 ${PosteriorName}.run - rm -f ch4_run.template + rm -f run.template ### Perform dry run if requested if "$PosteriorDryRun"; then @@ -163,13 +163,8 @@ run_posterior() { # Submit job to job scheduler printf "\n=== SUBMITTING POSTERIOR SIMULATION ===\n" - sbatch --mem $RequestedMemory \ -c
$RequestedCPUs \ - -t $RequestedTime \ - -p $SchedulerPartition \ - -W ${RunName}_Posterior.run - wait - + submit_job $SchedulerType false $RequestedMemory $RequestedCPUs $RequestedTime ${RunName}_Posterior.run + # check if exited with non-zero exit code [ ! -f ".error_status_file.txt" ] || imi_failed $LINENO @@ -206,7 +201,7 @@ run_posterior() { wait printf "\n=== DONE -- setup_gc_cache.py ===\n" - # Sample GEOS-Chem atmosphere with TROPOMI + # Sample GEOS-Chem atmosphere with satellite observations LonMinInvDomain=$(ncmin lon ${RunDirs}/StateVector.nc) LonMaxInvDomain=$(ncmax lon ${RunDirs}/StateVector.nc) LatMinInvDomain=$(ncmin lat ${RunDirs}/StateVector.nc) @@ -218,14 +213,13 @@ if "$OptimizeOH"; then nElements=$((nElements + 1)) fi - FetchTROPOMI="False" isPost="True" buildJacobian="False" # fill kf_period with dummy number here kf_period=1 printf "\n=== Calling jacobian.py to sample posterior simulation (without jacobian sensitivity analysis) ===\n" - python ${InversionPath}/src/inversion_scripts/jacobian.py ${ConfigPath} $StartDate_i $EndDate_i $LonMinInvDomain $LonMaxInvDomain $LatMinInvDomain $LatMaxInvDomain $nElements $tropomiCache $BlendedTROPOMI $UseWaterObs $isPost $kf_period $buildJacobian False + python ${InversionPath}/src/inversion_scripts/jacobian.py ${ConfigPath} $StartDate_i $EndDate_i $LonMinInvDomain $LonMaxInvDomain $LatMinInvDomain $LatMaxInvDomain $nElements $Species $satelliteCache $SatelliteProduct $UseWaterObs $isPost $kf_period $buildJacobian False wait printf "\n=== DONE sampling the posterior simulation ===\n\n" posterior_end=$(date +%s) diff --git a/src/components/preview_component/preview.sh b/src/components/preview_component/preview.sh index 60dd701f..33079892 100644 --- a/src/components/preview_component/preview.sh +++ b/src/components/preview_component/preview.sh @@ -30,25 +30,17 @@ run_preview() { config_path=${InversionPath}/${ConfigFile} state_vector_path=${RunDirs}/StateVector.nc preview_dir=${RunDirs}/${runDir} - tropomi_cache=${RunDirs}/satellite_data + satellite_cache=${RunDirs}/satellite_data preview_file=${InversionPath}/src/inversion_scripts/imi_preview.py # Run preview script # If running end to end script with sbatch then use # sbatch to take advantage of multiple cores printf "\nCreating preview plots and statistics... " - if "$UseSlurm"; then + if [[ "$SchedulerType" = "slurm" || "$SchedulerType" = "PBS" ]]; then rm -f .preview_error_status.txt chmod +x $preview_file - sbatch --mem $RequestedMemory \ - -c $RequestedCPUs \ - -t $RequestedTime \ - -p $SchedulerPartition \ - -o imi_output.tmp \ - -W $preview_file $InversionPath $ConfigPath $state_vector_path $preview_dir $tropomi_cache - wait - cat imi_output.tmp >>${InversionPath}/imi_output.log - rm imi_output.tmp + submit_job $SchedulerType true $RequestedMemory $RequestedCPUs $RequestedTime $preview_file $InversionPath $ConfigPath $state_vector_path $preview_dir $Species $satellite_cache # check for any errors [ ! -f ".preview_error_status.txt" ] || imi_failed $LINENO else diff --git a/src/components/setup_component/setup.sh b/src/components/setup_component/setup.sh index ac52346b..f7208b61 100644 --- a/src/components/setup_component/setup.sh +++ b/src/components/setup_component/setup.sh @@ -22,7 +22,10 @@ setup_imi() { SpinupEnd=${StartDate} # Use global boundary condition files for initial conditions - UseBCsForRestart=true + # NOTE: defaulting to BC-based initial conditions does not yet work as smoothly for CO2, so only apply it for CH4.
+    if [[ $Species == "CH4" ]]; then
+        UseBCsForRestart=true
+    fi
 
     ##=======================================================================
     ## Download Boundary Conditions files if requested
@@ -126,7 +129,6 @@ setup_imi() {
     ##=======================================================================
     ## Create or copy state vector file
     ##=======================================================================
-
     if "$CreateAutomaticRectilinearStateVectorFile"; then
         create_statevector
     else
@@ -267,4 +269,4 @@ activate_observations() {
         sed -i "s/$OLD/$NEW/g" geoschem_config.yml
     fi
 
-}
+}
\ No newline at end of file
diff --git a/src/components/spinup_component/spinup.sh b/src/components/spinup_component/spinup.sh
index 10db8457..390d6be5 100644
--- a/src/components/spinup_component/spinup.sh
+++ b/src/components/spinup_component/spinup.sh
@@ -45,7 +45,7 @@ setup_spinup() {
         -e "s|${EndDate}|${SpinupEnd}|g" geoschem_config.yml
 
     # Turn on LevelEdgeDiags output
-    if "$HourlyCH4"; then
+    if "$HourlySpecies"; then
         sed -i -e 's/#'\''LevelEdgeDiags/'\''LevelEdgeDiags/g' \
             -e 's/LevelEdgeDiags.frequency: 00000100 000000/LevelEdgeDiags.frequency: 00000000 010000/g' \
             -e 's/LevelEdgeDiags.duration: 00000100 000000/LevelEdgeDiags.duration: 00000001 000000/g' \
@@ -54,9 +54,9 @@ setup_spinup() {
 
     # Create run script from template
     sed -e "s:namename:${SpinupName}:g" \
-        -e "s:##:#:g" ch4_run.template >${SpinupName}.run
+        -e "s:##:#:g" run.template > ${SpinupName}.run
     chmod 755 ${SpinupName}.run
-    rm -f ch4_run.template
+    rm -f run.template
 
     ### Perform dry run if requested
     if "$SpinupDryrun"; then
@@ -84,12 +88,7 @@ run_spinup() {
     cd ${RunDirs}/spinup_run
 
     # Submit job to job scheduler
-    sbatch --mem $RequestedMemory \
-        -c $RequestedCPUs \
-        -t $RequestedTime \
-        -p $SchedulerPartition \
-        -W ${RunName}_Spinup.run
-    wait
+    submit_job $SchedulerType false $RequestedMemory $RequestedCPUs $RequestedTime ${RunName}_Spinup.run
 
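Throughout these scripts, the direct sbatch invocations are replaced by a submit_job helper whose definition is not part of the hunks shown here. The following is only a sketch of what such a dispatcher could look like, inferred from the call sites (positional order: scheduler type, capture-output flag, memory, CPUs, walltime, then the job script and its arguments); the PBS branch and the no-scheduler fallback are assumptions, not the IMI's actual implementation:

    # sketch of a scheduler dispatcher consistent with the call sites above
    submit_job() {
        local scheduler=$1 capture_output=$2 mem=$3 cpus=$4 walltime=$5
        shift 5   # remaining arguments: job script plus its arguments
        if [[ "$scheduler" = "slurm" ]]; then
            if [[ "$capture_output" = "true" ]]; then
                # mirror the old pattern: write to a temp file, then append to the log
                sbatch --mem "$mem" -c "$cpus" -t "$walltime" -p "$SchedulerPartition" \
                    -o imi_output.tmp -W "$@"
                wait
                cat imi_output.tmp >> "${InversionPath}/imi_output.log" && rm -f imi_output.tmp
            else
                sbatch --mem "$mem" -c "$cpus" -t "$walltime" -p "$SchedulerPartition" -W "$@"
                wait
            fi
        elif [[ "$scheduler" = "PBS" ]]; then
            # -W block=true makes qsub wait for completion, like slurm's -W
            qsub -l mem="$mem" -l ncpus="$cpus" -l walltime="$walltime" -W block=true "$@"
        else
            "$@"   # no scheduler: run the command directly (e.g. interactively under tmux)
        fi
    }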
-f ".error_status_file.txt" ] || imi_failed $LINENO diff --git a/src/components/statevector_component/aggregation.py b/src/components/statevector_component/aggregation.py index 17715844..6813c26a 100755 --- a/src/components/statevector_component/aggregation.py +++ b/src/components/statevector_component/aggregation.py @@ -629,17 +629,17 @@ def update_sv_clusters(config, flat_sensi, orig_sv): config_path = sys.argv[2] state_vector_path = sys.argv[3] preview_dir = sys.argv[4] - tropomi_cache = sys.argv[5] + satellite_cache = sys.argv[5] kf_index = int(sys.argv[6]) if len(sys.argv) > 6 else None config = yaml.load(open(config_path), Loader=yaml.FullLoader) original_clusters = xr.open_dataset(state_vector_path) sensitivity_args = [ - config, - state_vector_path, - preview_dir, - tropomi_cache, - False, + config, + state_vector_path, + preview_dir, + satellite_cache, + False ] # dynamically generate sensitivities with only a diff --git a/src/components/statevector_component/statevector.sh b/src/components/statevector_component/statevector.sh index f90b7eed..e5fc5cf0 100644 --- a/src/components/statevector_component/statevector.sh +++ b/src/components/statevector_component/statevector.sh @@ -53,8 +53,9 @@ reduce_dimension() { # set input variables state_vector_path=${RunDirs}/StateVector.nc native_state_vector_path=${RunDirs}/NativeStateVector.nc - preview_dir=${RunDirs}/preview - tropomi_cache=${RunDirs}/satellite_data + + preview_dir=${RunDirs}/preview_run + satellite_cache=${RunDirs}/satellite_data aggregation_file=${InversionPath}/src/components/statevector_component/aggregation.py if [[ ! -f ${RunDirs}/NativeStateVector.nc ]]; then @@ -67,7 +68,7 @@ reduce_dimension() { fi # conditionally add period_i to python args - python_args=($aggregation_file $InversionPath $ConfigPath $state_vector_path $preview_dir $tropomi_cache) + python_args=($aggregation_file $InversionPath $config_path $state_vector_path $preview_dir $satellite_cache) archive_sv=false if ("$KalmanMode" && "$DynamicKFClustering"); then if [ -n "$period_i" ]; then @@ -77,19 +78,11 @@ reduce_dimension() { fi # if running end to end script with sbatch then use - # sbatch to take advantage of multiple cores - if "$UseSlurm"; then + # sbatch to take advantage of multiple cores + if [[ "$SchedulerType" = "slurm" || "$SchedulerType" = "PBS" ]]; then rm -f .aggregation_error.txt chmod +x $aggregation_file - sbatch --mem $RequestedMemory \ - -c $RequestedCPUs \ - -t $RequestedTime \ - -p $SchedulerPartition \ - -o imi_output.tmp \ - -W "${python_args[@]}" - wait - cat imi_output.tmp >>${InversionPath}/imi_output.log - rm imi_output.tmp + submit_job $SchedulerType true $RequestedMemory $RequestedCPUs $RequestedTime "${python_args[@]}" # check for any errors [ ! 
-f ".aggregation_error.txt" ] || imi_failed $LINENO else diff --git a/src/components/template_component/template.sh b/src/components/template_component/template.sh index e6cc94bd..c4d39230 100644 --- a/src/components/template_component/template.sh +++ b/src/components/template_component/template.sh @@ -35,6 +35,7 @@ setup_template() { printf "\n Options are GEOSFP or MERRA2.\n" exit 1 fi + if [ "$Res" = "4.0x5.0" ]; then cmd="9\n${metNum}\n1\n2\n${RunDirs}\n${runDir}\nn\n" elif [ "$Res" == "2.0x2.5" ]; then @@ -133,7 +134,7 @@ setup_template() { -e "s:'Met_PFLLSAN:#'Met_PFLLSAN:g" HISTORY.rc # If turned on, save out hourly CH4 concentrations to daily files - if "$HourlyCH4"; then + if "$HourlySpecies"; then sed -i -e 's/SpeciesConc.frequency: 00000100 000000/SpeciesConc.frequency: 00000000 010000/g' \ -e 's/SpeciesConc.duration: 00000100 000000/SpeciesConc.duration: 00000001 000000/g' \ -e 's/SpeciesConc.mode: '\''time-averaged/SpeciesConc.mode: '\''instantaneous/g' HISTORY.rc @@ -143,7 +144,7 @@ setup_template() { rm -f Restarts/GEOSChem.Restart.20190101_0000z.nc4 # Copy template run script - cp ${InversionPath}/src/geoschem_run_scripts/ch4_run.template . + cp ${InversionPath}/src/geoschem_run_scripts/run.template . # Copy input file for applying emissions perturbations via HEMCO cp ${InversionPath}/src/geoschem_run_scripts/Perturbations.txt . diff --git a/src/geoschem_run_scripts/ch4_run.template b/src/geoschem_run_scripts/run.template similarity index 98% rename from src/geoschem_run_scripts/ch4_run.template rename to src/geoschem_run_scripts/run.template index f5b07045..02ccc18f 100755 --- a/src/geoschem_run_scripts/ch4_run.template +++ b/src/geoschem_run_scripts/run.template @@ -1,5 +1,4 @@ #!/bin/bash -##SBATCH -N 1 ##SBATCH --mail-type=END # Set the proper # of threads for OpenMP diff --git a/src/geoschem_run_scripts/run_jacobian_simulations.sh b/src/geoschem_run_scripts/run_jacobian_simulations.sh index 8fa51b7f..b7bc8167 100755 --- a/src/geoschem_run_scripts/run_jacobian_simulations.sh +++ b/src/geoschem_run_scripts/run_jacobian_simulations.sh @@ -1,6 +1,5 @@ #!/bin/bash #SBATCH -J {RunName} -#SBATCH -N 1 ### Run directory RUNDIR=$(pwd -P) diff --git a/src/geoschem_run_scripts/run_prior_simulation.sh b/src/geoschem_run_scripts/run_prior_simulation.sh index 08ba1446..ed2d3c7e 100755 --- a/src/geoschem_run_scripts/run_prior_simulation.sh +++ b/src/geoschem_run_scripts/run_prior_simulation.sh @@ -1,6 +1,5 @@ #!/bin/bash #SBATCH -J {RunName} -#SBATCH -N 1 ### Run directory RUNDIR=$(pwd -P) diff --git a/src/geoschem_run_scripts/submit_jacobian_simulations_array.sh b/src/geoschem_run_scripts/submit_jacobian_simulations_array.sh index baa010b6..1df441a3 100755 --- a/src/geoschem_run_scripts/submit_jacobian_simulations_array.sh +++ b/src/geoschem_run_scripts/submit_jacobian_simulations_array.sh @@ -4,13 +4,26 @@ echo "running {END} jacobian simulations" >> {InversionPath}/imi_output.log # remove error status file if present rm -f .error_status_file.txt -sbatch --array={START}-{END}{JOBS} --mem $RequestedMemory \ --c $RequestedCPUs \ --t $RequestedTime \ --p $SchedulerPartition \ --o imi_output.tmp \ ---open-mode=append \ --W run_jacobian_simulations.sh +if [[ $SchedulerType = "slurm" | $SchedulerType = "tmux" ]]; then + sbatch --array={START}-{END}{JOBS} --mem $RequestedMemory \ + -c $RequestedCPUs \ + -t $RequestedTime \ + -p $SchedulerPartition \ + -o imi_output.tmp \ + --open-mode=append \ + -W run_jacobian_simulations.sh +elif [[ $SchedulerType = "PBS" ]]; then + qsub -J 
diff --git a/src/geoschem_run_scripts/submit_jacobian_simulations_array.sh b/src/geoschem_run_scripts/submit_jacobian_simulations_array.sh
index baa010b6..1df441a3 100755
--- a/src/geoschem_run_scripts/submit_jacobian_simulations_array.sh
+++ b/src/geoschem_run_scripts/submit_jacobian_simulations_array.sh
@@ -4,13 +4,26 @@ echo "running {END} jacobian simulations" >> {InversionPath}/imi_output.log
 # remove error status file if present
 rm -f .error_status_file.txt
 
-sbatch --array={START}-{END}{JOBS} --mem $RequestedMemory \
--c $RequestedCPUs \
--t $RequestedTime \
--p $SchedulerPartition \
--o imi_output.tmp \
---open-mode=append \
--W run_jacobian_simulations.sh
+if [[ $SchedulerType = "slurm" || $SchedulerType = "tmux" ]]; then
+    sbatch --array={START}-{END}{JOBS} --mem $RequestedMemory \
+    -c $RequestedCPUs \
+    -t $RequestedTime \
+    -p $SchedulerPartition \
+    -o imi_output.tmp \
+    --open-mode=append \
+    -W run_jacobian_simulations.sh
+elif [[ $SchedulerType = "PBS" ]]; then
+    qsub -J {START}-{END}{JOBS} \
+    -l nodes=1 \
+    -l mem="$RequestedMemory" \
+    -l ncpus=$RequestedCPUs \
+    -l walltime=$RequestedTime \
+    -l site=needed=$SitesNeeded \
+    -l model=ivy \
+    -o imi_output.tmp \
+    -j oe -k oe \
+    -W block=true run_jacobian_simulations.sh
+fi
 
 cat imi_output.tmp >> {InversionPath}/imi_output.log
-rm imi_output.tmp
\ No newline at end of file
+rm imi_output.tmp
diff --git a/src/inversion_scripts/imi_preview.py b/src/inversion_scripts/imi_preview.py
index 76b08e7f..5ccbd975 100755
--- a/src/inversion_scripts/imi_preview.py
+++ b/src/inversion_scripts/imi_preview.py
@@ -1,8 +1,6 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
-# SBATCH -N 1
-
 import os
 import sys
 import yaml
@@ -23,34 +21,31 @@ from src.inversion_scripts.utils import (
     sum_total_emissions,
     plot_field,
-    filter_tropomi,
-    filter_blended,
+    read_and_filter_satellite,
     calculate_area_in_km,
     calculate_superobservation_error,
+    species_molar_mass,
+    mixing_ratio_conv_factor,
     get_mean_emissions,
     get_posterior_emissions,
 )
 from joblib import Parallel, delayed
-from src.inversion_scripts.operators.TROPOMI_operator import (
-    read_tropomi,
-    read_blended,
-)
 
 warnings.filterwarnings("ignore", category=FutureWarning)
 
-def get_TROPOMI_data(
-    file_path, BlendedTROPOMI, xlim, ylim, startdate_np64, enddate_np64, use_water_obs
+def get_satellite_data(
+    file_path, satellite_str, species, xlim, ylim, startdate_np64, enddate_np64,
+    use_water_obs
 ):
     """
-    Returns a dict with the lat, lon, xch4, and albedo_swir observations
-    extracted from the given tropomi file. Filters are applied to remove
+    Returns a dict with the lat, lon, xspecies, and albedo_swir observations
+    extracted from the given satellite file. Filters are applied to remove
     unsuitable observations
     Args:
        file_path : string
-           path to the tropomi file
-       BlendedTROPOMI : bool
-           if True, use blended TROPOMI+GOSAT data
+           path to the satellite file
+       satellite_str : str
+           name of satellite product
        xlim: list
            longitudinal bounds for region of interest
        ylim: list
@@ -62,54 +57,38 @@ def get_TROPOMI_data(
        use_water_obs: bool
            if True, use observations over water
    Returns:
-       tropomi_data: dict
+       satellite_data: dict
            dictionary of the extracted values
    """
-    # tropomi data dictionary
-    tropomi_data = {"lat": [], "lon": [], "xch4": [], "swir_albedo": [], "time": []}
+    # satellite data dictionary
+    satellite_data = {"lat": [], "lon": [], species: [], "swir_albedo": [], "time": []}
 
-    # Load the TROPOMI data
-    assert isinstance(BlendedTROPOMI, bool), "BlendedTROPOMI is not a bool"
-    if BlendedTROPOMI:
-        TROPOMI = read_blended(file_path)
-    else:
-        TROPOMI = read_tropomi(file_path)
-    if TROPOMI == None:
-        print(f"Skipping {file_path} due to error")
-        return TROPOMI
-
-    if BlendedTROPOMI:
-        # Only going to consider data within lat/lon/time bounds and without problematic coastal pixels
-        sat_ind = filter_blended(
-            TROPOMI, xlim, ylim, startdate_np64, enddate_np64, use_water_obs
-        )
-    else:
-        # Only going to consider data within lat/lon/time bounds, with QA > 0.5, and with safe surface albedo values
-        sat_ind = filter_tropomi(
-            TROPOMI, xlim, ylim, startdate_np64, enddate_np64, use_water_obs
-        )
+    # Load the satellite data
+    satellite, sat_ind = read_and_filter_satellite(
+        file_path, satellite_str, startdate_np64, enddate_np64, xlim, ylim,
+        use_water_obs)
 
     # Loop over observations and archive
     num_obs = len(sat_ind[0])
     for k in range(num_obs):
         lat_idx = sat_ind[0][k]
         lon_idx = sat_ind[1][k]
-        tropomi_data["lat"].append(TROPOMI["latitude"][lat_idx, lon_idx])
-        tropomi_data["lon"].append(TROPOMI["longitude"][lat_idx, lon_idx])
-        tropomi_data["xch4"].append(TROPOMI["methane"][lat_idx, lon_idx])
-        tropomi_data["swir_albedo"].append(TROPOMI["swir_albedo"][lat_idx, lon_idx])
-        tropomi_data["time"].append(TROPOMI["time"][lat_idx, lon_idx])
+        satellite_data["lat"].append(satellite["latitude"][lat_idx, lon_idx])
+        satellite_data["lon"].append(satellite["longitude"][lat_idx, lon_idx])
+        satellite_data[species].append(satellite[species][lat_idx, lon_idx])
+        satellite_data["swir_albedo"].append(satellite["swir_albedo"][lat_idx, lon_idx])
+        satellite_data["time"].append(satellite["time"][lat_idx, lon_idx])
 
-    return tropomi_data
+    return satellite_data
 
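read_and_filter_satellite (imported from utils at the top of this file) absorbs the product-specific read/filter branching deleted above. Its definition is not among these hunks; a minimal sketch of the dispatch, reconstructed from the removed code and the call sites (the handling of an "Other" product is an assumption):

    def read_and_filter_satellite(
        file_path, satellite_str, startdate_np64, enddate_np64, xlim, ylim, use_water_obs
    ):
        # Read the file with the product-appropriate reader
        if satellite_str == "BlendedTROPOMI":
            satellite = read_blended(file_path)
        else:  # "TROPOMI"; an "Other" product would need its own reader here
            satellite = read_tropomi(file_path)
        if satellite is None:
            print(f"Skipping {file_path} due to error")
            return None, None
        # Index the observations that pass the product-appropriate filters
        if satellite_str == "BlendedTROPOMI":
            sat_ind = filter_blended(
                satellite, xlim, ylim, startdate_np64, enddate_np64, use_water_obs
            )
        else:
            sat_ind = filter_tropomi(
                satellite, xlim, ylim, startdate_np64, enddate_np64, use_water_obs
            )
        return satellite, sat_ind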
tropomi_data["lon"].append(TROPOMI["longitude"][lat_idx, lon_idx]) - tropomi_data["xch4"].append(TROPOMI["methane"][lat_idx, lon_idx]) - tropomi_data["swir_albedo"].append(TROPOMI["swir_albedo"][lat_idx, lon_idx]) - tropomi_data["time"].append(TROPOMI["time"][lat_idx, lon_idx]) + satellite_data["lat"].append(satellite["latitude"][lat_idx, lon_idx]) + satellite_data["lon"].append(satellite["longitude"][lat_idx, lon_idx]) + satellite_data[species].append(satellite[species][lat_idx, lon_idx]) + satellite_data["swir_albedo"].append(satellite["swir_albedo"][lat_idx, lon_idx]) + satellite_data["time"].append(satellite["time"][lat_idx, lon_idx]) - return tropomi_data + return satellite_data def imi_preview( - inversion_path, config_path, state_vector_path, preview_dir, tropomi_cache + inversion_path, config_path, state_vector_path, preview_dir, species, satellite_cache ): """ Function to perform preview Requires preview simulation to have been run already (to generate HEMCO diags) - Requires TROPOMI data to have been downloaded already + Requires satellite data to have been downloaded already """ # ---------------------------------- @@ -133,12 +112,12 @@ def imi_preview( # # Define mask for ROI, to be used below a, df, num_days, prior, outstrings = estimate_averaging_kernel( - config, - state_vector_path, - preview_dir, - tropomi_cache, - preview=True, - kf_index=None, + config, + state_vector_path, + preview_dir, + satellite_cache, + preview=True, + kf_index=None ) mask = state_vector_labels <= last_ROI_element @@ -216,7 +195,7 @@ def imi_preview( ds = df_means.to_xarray() # Prepare plot data for observation counts - df_counts = df.copy(deep=True).drop(["xch4", "swir_albedo"], axis=1) + df_counts = df.copy(deep=True).drop([species, "swir_albedo"], axis=1) df_counts["counts"] = 1 df_counts["lat"] = np.round(df_counts["lat"], 1) # Bin to 0.1x0.1 degrees df_counts["lon"] = np.round(df_counts["lon"], 1) @@ -261,14 +240,14 @@ def imi_preview( xch4_min, xch4_max = dynamic_range(ds["xch4"].values) plot_field( ax, - ds["xch4"], + ds[species], cmap="Spectral_r", plot_type="pcolormesh", vmin=xch4_min, vmax=xch4_max, lon_bounds=None, lat_bounds=None, - title="TROPOMI $X_{CH4}$", + title=f"Satellite $X_{species}$", cbar_label="Column mixing ratio (ppb)", mask=mask if config["isRegional"] else None, only_ROI=False, @@ -401,7 +380,7 @@ def map_sensitivities_to_sv(sensitivities, sv, last_ROI_element): def estimate_averaging_kernel( - config, state_vector_path, preview_dir, tropomi_cache, preview=False, kf_index=None + config, species, state_vector_path, preview_dir, satellite_cache, preview=False, kf_index=None ): """ Estimates the averaging kernel sensitivities using prior emissions @@ -454,7 +433,7 @@ def estimate_averaging_kernel( else: prior_ds = get_mean_emissions(startday, endday, prior_cache) - prior = prior_ds["EmisCH4_Total"] + prior = prior_ds[f"Emis{species}_Total"] # Compute total emissions in the region of interest areas = prior_ds["AREA"] @@ -468,9 +447,9 @@ def estimate_averaging_kernel( # Observations in region of interest # ---------------------------------- - # Paths to tropomi data files - tropomi_files = [f for f in os.listdir(tropomi_cache) if ".nc" in f] - tropomi_paths = [os.path.join(tropomi_cache, f) for f in tropomi_files] + # Paths to satellite data files + satellite_files = [f for f in os.listdir(satellite_cache) if ".nc" in f] + satellite_paths = [os.path.join(satellite_cache, f) for f in satellite_files] # Latitude/longitude bounds of the inversion domain xlim = 
 
     # Compute total emissions in the region of interest
     areas = prior_ds["AREA"]
@@ -468,9 +447,9 @@ def estimate_averaging_kernel(
 
     # ----------------------------------
     # Observations in region of interest
     # ----------------------------------
 
-    # Paths to tropomi data files
-    tropomi_files = [f for f in os.listdir(tropomi_cache) if ".nc" in f]
-    tropomi_paths = [os.path.join(tropomi_cache, f) for f in tropomi_files]
+    # Paths to satellite data files
+    satellite_files = [f for f in os.listdir(satellite_cache) if ".nc" in f]
+    satellite_paths = [os.path.join(satellite_cache, f) for f in satellite_files]
 
     # Latitude/longitude bounds of the inversion domain
     xlim = [float(state_vector.lon.min()), float(state_vector.lon.max())]
@@ -486,46 +465,47 @@ def estimate_averaging_kernel(
         - datetime.timedelta(days=1)
     )
 
-    # Only consider tropomi files within date range (in case more are present)
-    tropomi_paths = [
+    # Only consider satellite files within date range (in case more are present)
+    satellite_paths = [
         p
-        for p in tropomi_paths
+        for p in satellite_paths
         if int(p.split("____")[1][0:8]) >= int(startday)
         and int(p.split("____")[1][0:8]) < int(endday)
     ]
-    tropomi_paths.sort()
+    satellite_paths.sort()
 
-    # Use blended TROPOMI+GOSAT data or operational TROPOMI data?
-    BlendedTROPOMI = config["BlendedTROPOMI"]
+    # What satellite data product to use?
+    satellite_str = config["SatelliteProduct"]
 
-    # Open tropomi files and filter data
+    # Open satellite files and filter data
     lat = []
     lon = []
-    xch4 = []
+    xspecies = []
     albedo = []
     trtime = []
 
-    # Read in and filter tropomi observations (uses parallel processing)
+    # Read in and filter satellite observations (uses parallel processing)
     observation_dicts = Parallel(n_jobs=-1)(
-        delayed(get_TROPOMI_data)(
-            file_path,
-            BlendedTROPOMI,
-            xlim,
-            ylim,
-            startdate_np64,
+        delayed(get_satellite_data)(
+            file_path,
+            satellite_str,
+            species,
+            xlim,
+            ylim,
+            startdate_np64,
             enddate_np64,
-            use_water_obs,
+            use_water_obs
         )
-        for file_path in tropomi_paths
+        for file_path in satellite_paths
     )
 
     # Remove any problematic observation dicts (eg. corrupted data file)
     observation_dicts = list(filter(None, observation_dicts))
 
     for obs_dict in observation_dicts:
         lat.extend(obs_dict["lat"])
         lon.extend(obs_dict["lon"])
-        xch4.extend(obs_dict["xch4"])
+        xspecies.extend(obs_dict[species])
         albedo.extend(obs_dict["swir_albedo"])
         trtime.extend(obs_dict["time"])
 
     # Assemble in dataframe
@@ -534,7 +514,7 @@ def estimate_averaging_kernel(
     df["lon"] = lon
     df["obs_count"] = np.ones(len(lat))
     df["swir_albedo"] = albedo
-    df["xch4"] = xch4
+    df[species] = xspecies
     df["time"] = trtime
 
     # Set resolution specific variables
@@ -671,11 +651,11 @@ def process(i):
         p = 101325  # Surface pressure [Pa = kg/m/s2]
         g = 9.8  # Gravity [m/s2]
         Mair = 0.029  # Molar mass of air [kg/mol]
-        Mch4 = 0.01604  # Molar mass of methane [kg/mol]
+        Mspecies = species_molar_mass(species)  # Molar mass of species [kg/mol]
         alpha = 0.4  # Simple parameterization of turbulence
 
         # Change units of total prior emissions
-        emissions_kgs = emissions * 1e9 / (3600 * 24 * 365)  # kg/s from Tg/y
+        emissions_kgs = emissions * 1e9 / (3600 * 24 * 365)  # kg/s from Tg/y (1 Tg = 1e9 kg for any species)
         emissions_kgs_per_m2 = emissions_kgs / np.power(
             L, 2
         )  # kg/m2/s from kg/s, per element
@@ -698,7 +678,7 @@ def process(i):
         calculate_superobservation_error(sO, element) if element >= 1.0 else s_superO_1
         for element in P
     ]
-    s_superO = np.array(s_superO_p) * 1e-9  # convert to ppb
+    s_superO = np.array(s_superO_p) / mixing_ratio_conv_factor(species)  # convert to mixing ratio
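A quick numeric check of the two conversions above, for CH4 (the 10 Tg a-1 and 10 ppb figures are illustrative, not from the diff):

    # Tg a-1 -> kg s-1 is a fixed mass conversion (1 Tg = 1e9 kg), independent of species:
    emissions = 10.0                                        # Tg a-1, illustrative
    emissions_kgs = emissions * 1e9 / (3600 * 24 * 365)     # ~317.1 kg s-1

    # ppb -> dimensionless mixing ratio uses the species factor (1e9 for CH4):
    s_superO_ppb = 10.0                                     # ppb, illustrative
    s_superO = s_superO_ppb / mixing_ratio_conv_factor("CH4")   # 1.0e-8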
 
     # TODO: add eqn number from Estrada et al. 2024 once published
 
     # Averaging kernel sensitivity for each grid element
@@ -707,7 +687,7 @@ def process(i):
     # in the state vector element
     # a is set to 0 where m_superi is 0
     m_superi = np.array(m_superi)
-    k = alpha * (Mair * L * g / (Mch4 * U * p))
+    k = alpha * (Mair * L * g / (Mspecies * U * p))
     a = sA**2 / (sA**2 + (s_superO / k) ** 2 / (m_superi))
 
     # Places with 0 superobs should be 0
@@ -743,10 +723,11 @@ def process(i):
     config_path = sys.argv[2]
     state_vector_path = sys.argv[3]
     preview_dir = sys.argv[4]
-    tropomi_cache = sys.argv[5]
+    species = sys.argv[5]
+    satellite_cache = sys.argv[6]
 
     imi_preview(
-        inversion_path, config_path, state_vector_path, preview_dir, tropomi_cache
+        inversion_path, config_path, state_vector_path, preview_dir, species, satellite_cache
     )
 except Exception as err:
     with open(".preview_error_status.txt", "w") as file1:
diff --git a/src/inversion_scripts/invert.py b/src/inversion_scripts/invert.py
index 1355c170..47c21990 100644
--- a/src/inversion_scripts/invert.py
+++ b/src/inversion_scripts/invert.py
@@ -71,7 +71,7 @@ def do_inversion(
     xlim = [lon_min + degx, lon_max - degx]
     ylim = [lat_min + degy, lat_max - degy]
 
-    # Read output data from jacobian.py (virtual & true TROPOMI columns, Jacobian matrix)
+    # Read output data from jacobian.py (virtual & true satellite columns, Jacobian matrix)
     files = glob.glob(f"{jacobian_dir}/*.pkl")
     files.sort()
 
@@ -113,14 +113,14 @@ def do_inversion(
     for fi in files:
         print(fi)
 
-        # Load TROPOMI/GEOS-Chem and Jacobian matrix data from the .pkl file
+        # Load satellite/GEOS-Chem and Jacobian matrix data from the .pkl file
         dat = load_obj(fi)
 
-        # Skip if there aren't any TROPOMI observations on this day
+        # Skip if there aren't any satellite observations on this day
         if dat["obs_GC"].shape[0] == 0:
             continue
 
-        # Otherwise, grab the TROPOMI/GEOS-Chem data
+        # Otherwise, grab the satellite/GEOS-Chem data
         obs_GC = dat["obs_GC"]
 
         # Only consider data within the new latitude and longitude bounds
@@ -135,7 +135,7 @@ def do_inversion(
         if len(ind) == 0:
             continue
 
-        # TROPOMI and GEOS-Chem data within bounds
+        # satellite and GEOS-Chem data within bounds
         obs_GC = obs_GC[ind, :]
 
         # weight obs_err based on the observation count to prevent overfitting
diff --git a/src/inversion_scripts/jacobian.py b/src/inversion_scripts/jacobian.py
index d7404123..0dfe5088 100644
--- a/src/inversion_scripts/jacobian.py
+++ b/src/inversion_scripts/jacobian.py
@@ -9,9 +9,9 @@ import datetime
 import yaml
 
 from src.inversion_scripts.utils import save_obj
-from src.inversion_scripts.operators.TROPOMI_operator import (
-    apply_average_tropomi_operator,
-    apply_tropomi_operator,
+from src.inversion_scripts.operators.satellite_operator import (
+    apply_average_satellite_operator,
+    apply_satellite_operator,
 )
 from joblib import Parallel, delayed
 
@@ -25,18 +25,19 @@ def apply_operator(operator, params, config):
         params [dict] : parameters to run the given operator
     Returns
         output [dict] : Dictionary with:
-                            - obs_GC : GEOS-Chem and TROPOMI methane data
-                            - TROPOMI methane
-                            - GEOS-Chem methane
-                            - TROPOMI lat, lon
-                            - TROPOMI lat index, lon index
+                            - obs_GC : GEOS-Chem and satellite column data
+                            - satellite columns
+                            - GEOS-Chem columns
+                            - satellite lat, lon
+                            - satellite lat index, lon index
                         If build_jacobian=True, also include:
                             - K : Jacobian matrix
     """
-    if operator == "TROPOMI_average":
-        return apply_average_tropomi_operator(
+    if operator == "satellite_average":
+        return apply_average_satellite_operator(
             params["filename"],
-            params["BlendedTROPOMI"],
+
params["species"], + params["satellite_product"], params["n_elements"], params["gc_startdate"], params["gc_enddate"], @@ -48,10 +49,11 @@ def apply_operator(operator, params, config): config, params["use_water_obs"], ) - elif operator == "TROPOMI": - return apply_tropomi_operator( + elif operator == "satellite": + return apply_satellite_operator( params["filename"], - params["BlendedTROPOMI"], + params["species"], + params["satellite_product"], params["n_elements"], params["gc_startdate"], params["gc_enddate"], @@ -77,13 +79,14 @@ def apply_operator(operator, params, config): latmin = float(sys.argv[6]) latmax = float(sys.argv[7]) n_elements = int(sys.argv[8]) - tropomi_cache = sys.argv[9] - BlendedTROPOMI = sys.argv[10].lower() == "true" - use_water_obs = sys.argv[11].lower() == "true" - isPost = sys.argv[12] - period_i = int(sys.argv[13]) - build_jacobian = sys.argv[14] - viz_prior = sys.argv[15] + species = sys.argv[9] + satellite_cache = sys.argv[10] + satellite_product = sys.argv[11] + use_water_obs = sys.argv[12] + isPost = sys.argv[13] + period_i = int(sys.argv[14]) + build_jacobian = sys.argv[15] + viz_prior = sys.argv[16] # Reformat start and end days for datetime in configuration start = f"{startday[0:4]}-{startday[4:6]}-{startday[6:8]} 00:00:00" @@ -121,8 +124,8 @@ def apply_operator(operator, params, config): print("Start:", start) print("End:", end) - # Get TROPOMI data filenames for the desired date range - allfiles = glob.glob(f"{tropomi_cache}/*.nc") + # Get satellite data filenames for the desired date range + allfiles = glob.glob(f"{satellite_cache}/*.nc") sat_files = [] for index in range(len(allfiles)): filename = allfiles[index] @@ -133,27 +136,28 @@ def apply_operator(operator, params, config): if (strdate >= gc_startdate) and (strdate <= gc_enddate): sat_files.append(filename) sat_files.sort() - print("Found", len(sat_files), "TROPOMI data files.") + print("Found", len(sat_files), "satellite data files.") - # Map GEOS-Chem to TROPOMI observation space + # Map GEOS-Chem to satellite observation space # Also return Jacobian matrix if build_jacobian=True def process(filename): - # Check if TROPOMI file has already been processed + # Check if satellite file has already been processed print("========================") shortname = re.split(r"\/", filename)[-1] print(shortname) date = re.split(r"\.", shortname)[0] - # If not yet processed, run apply_average_tropomi_operator() - if not os.path.isfile(f"{outputdir}/{date}_GCtoTROPOMI.pkl"): - print("Applying TROPOMI operator...") + # If not yet processed, run apply_average_satellite_operator() + if not os.path.isfile(f"{outputdir}/{date}_GCtoSatellite.pkl"): + print("Applying satellite operator...") output = apply_operator( - "TROPOMI_average", + "satellite_average", { "filename": filename, - "BlendedTROPOMI": BlendedTROPOMI, + "species" : species, + "satellite_product": satellite_product, "n_elements": n_elements, "gc_startdate": gc_startdate, "gc_enddate": gc_enddate, @@ -167,12 +171,13 @@ def process(filename): config, ) - # we also save out the unaveraged tropomi operator for visualization purposes + # we also save out the unaveraged satellite operator for visualization purposes viz_output = apply_operator( - "TROPOMI", + "satellite", { "filename": filename, - "BlendedTROPOMI": BlendedTROPOMI, + "species" : species, + "satellite_product": satellite_product, "n_elements": n_elements, "gc_startdate": gc_startdate, "gc_enddate": gc_enddate, @@ -193,8 +198,8 @@ def process(filename): if output["obs_GC"].shape[0] > 0: 
print("Saving .pkl file") - save_obj(output, f"{outputdir}/{date}_GCtoTROPOMI.pkl") - save_obj(viz_output, f"{vizdir}/{date}_GCtoTROPOMI.pkl") + save_obj(output, f"{outputdir}/{date}_GCtoSatellite.pkl") + save_obj(viz_output, f"{vizdir}/{date}_GCtoSatellite.pkl") return 0 results = Parallel(n_jobs=-1)(delayed(process)(filename) for filename in sat_files) diff --git a/src/inversion_scripts/lognormal_invert.py b/src/inversion_scripts/lognormal_invert.py index 53e5c00a..75e64daf 100644 --- a/src/inversion_scripts/lognormal_invert.py +++ b/src/inversion_scripts/lognormal_invert.py @@ -33,10 +33,10 @@ def lognormal_invert(config, state_vector_filepath, jacobian_sf): convergence_threshold = 5e-3 # Load in the observation and background data - ds = np.load("obs_ch4_tropomi.npz") - y = np.array(ds["obs_tropomi"]) - ds = np.load("gc_ch4_bkgd.npz") - ybkg = np.array(ds["gc_ch4_bkgd"]) + ds = np.load("obs_satellite.npz") + y = np.array(ds["obs_satellite"]) + ds = np.load("gc_bkgd.npz") + ybkg = np.array(ds["gc_bkgd"]) # We only solve using lognormal errors for state vector elements # within the domain of interest, not the buffer elements, the diff --git a/src/inversion_scripts/merge_partial_k.py b/src/inversion_scripts/merge_partial_k.py index 066880c7..4c700973 100644 --- a/src/inversion_scripts/merge_partial_k.py +++ b/src/inversion_scripts/merge_partial_k.py @@ -29,11 +29,11 @@ def merge_partial_k(satdat_dir, lat_bounds, lon_bounds, obs_err, precomp_K): obs_err [float]: default observational error value precomp_K [boolean]: whether or not to use precomputed jacobian matrices """ - # Get observed and GEOS-Chem-simulated TROPOMI columns - files = [f for f in np.sort(os.listdir(satdat_dir)) if "TROPOMI" in f] + # Get observed and GEOS-Chem-simulated satellite columns + files = [f for f in np.sort(os.listdir(satdat_dir)) if "Satellite" in f] # lat = np.array([]) # lon = np.array([]) - tropomi = np.array([]) + satellite = np.array([]) geos_prior = np.array([]) so = np.array([]) for i, f in enumerate(files): @@ -41,12 +41,12 @@ def merge_partial_k(satdat_dir, lat_bounds, lon_bounds, obs_err, precomp_K): # Get paths pth = os.path.join(satdat_dir, f) # Get same file from bc folder - # Load TROPOMI/GEOS-Chem and Jacobian matrix data from the .pkl file + # Load satellite/GEOS-Chem and Jacobian matrix data from the .pkl file obj = load_obj(pth) - # If there aren't any TROPOMI observations on this day, skip + # If there aren't any satellite observations on this day, skip if obj["obs_GC"].shape[0] == 0: continue - # Otherwise, grab the TROPOMI/GEOS-Chem data + # Otherwise, grab the satellite/GEOS-Chem data obs_GC = obj["obs_GC"] # Only consider data within latitude and longitude bounds ind = np.where( @@ -57,10 +57,10 @@ def merge_partial_k(satdat_dir, lat_bounds, lon_bounds, obs_err, precomp_K): ) if len(ind[0]) == 0: # Skip if no data in bounds continue - obs_GC = obs_GC[ind[0], :] # TROPOMI and GEOS-Chem data within bounds + obs_GC = obs_GC[ind[0], :] # satellite and GEOS-Chem data within bounds # concatenate full jacobian, obs, so, and prior - tropomi = np.concatenate((tropomi, obs_GC[:, 0])) + satellite = np.concatenate((satellite, obs_GC[:, 0])) geos_prior = np.concatenate((geos_prior, obs_GC[:, 1])) # read K from reference dir if precomp_K is true @@ -97,8 +97,8 @@ def merge_partial_k(satdat_dir, lat_bounds, lon_bounds, obs_err, precomp_K): gc_ch4_prior = np.asmatrix(geos_prior) - obs_tropomi = np.asmatrix(tropomi) - return gc_ch4_prior, obs_tropomi, K, so + obs_satellite = np.asmatrix(satellite) + 
return gc_ch4_prior, obs_satellite, K, so if __name__ == "__main__": @@ -109,21 +109,17 @@ def merge_partial_k(satdat_dir, lat_bounds, lon_bounds, obs_err, precomp_K): precomputed_jacobian = sys.argv[4] == "true" # directory containing partial K matrices - # Get observed and GEOS-Chem-simulated TROPOMI columns - files = np.sort(os.listdir(satdat_dir)) - files = [f for f in files if "TROPOMI" in f] - state_vector = xr.load_dataset(state_vector_filepath) state_vector_labels = state_vector["StateVector"] lon_bounds = [np.min(state_vector.lon.values), np.max(state_vector.lon.values)] lat_bounds = [np.min(state_vector.lat.values), np.max(state_vector.lat.values)] # Paths to GEOS/satellite data - gc_ch4_bkgd, obs_tropomi, jacobian_K, so = merge_partial_k( + gc_bkgd, obs_satellite, jacobian_K, so = merge_partial_k( satdat_dir, lat_bounds, lon_bounds, obs_error, precomputed_jacobian ) np.savez("full_jacobian_K.npz", K=jacobian_K) - np.savez("obs_ch4_tropomi.npz", obs_tropomi=obs_tropomi) - np.savez("gc_ch4_bkgd.npz", gc_ch4_bkgd=gc_ch4_bkgd) + np.savez("obs_satellite.npz", obs_satellite=obs_satellite) + np.savez("gc_bkgd.npz", gc_bkgd=gc_bkgd) np.savez("so_super.npz", so=so) diff --git a/src/inversion_scripts/operators/satellite_operator.py b/src/inversion_scripts/operators/satellite_operator.py new file mode 100644 index 00000000..b58f951f --- /dev/null +++ b/src/inversion_scripts/operators/satellite_operator.py @@ -0,0 +1,640 @@ +import numpy as np +import xarray as xr +import pandas as pd +import datetime +from shapely.geometry import Polygon +from src.inversion_scripts.utils import ( + read_and_filter_satellite, + mixing_ratio_conv_factor, + get_strdate, +) +from src.inversion_scripts.operators.operator_utilities import ( + get_gc_lat_lon, + read_all_geoschem, + merge_pressure_grids, + remap, + remap_sensitivities, + get_gridcell_list, + nearest_loc, +) + + +def apply_average_satellite_operator( + filename, + species, + satellite_product, + n_elements, + gc_startdate, + gc_enddate, + xlim, + ylim, + gc_cache, + build_jacobian, + sensi_cache, +): + """ + Apply the averaging satellite operator to map GEOS-Chem data to satellite observation space. + + Arguments + filename [str] : satellite netcdf data file to read + species [str] : The species (CH4 or CO2) to use + satellite_product [str] : "BlendedTROPOMI", "TROPOMI", or "Other", specifying the data used in the inversion. + n_elements [int] : Number of state vector elements + gc_startdate [datetime64] : First day of inversion period, for GEOS-Chem and satellite + gc_enddate [datetime64] : Last day of inversion period, for GEOS-Chem and satellite + xlim [float] : Longitude bounds for simulation domain + ylim [float] : Latitude bounds for simulation domain + gc_cache [str] : Path to GEOS-Chem output data + build_jacobian [log] : Are we trying to map GEOS-Chem sensitivities to satellite observation space? 
+ sensi_cache [str] : If build_jacobian=True, this is the path to the GEOS-Chem sensitivity data + + Returns + output [dict] : Dictionary with: + - obs_GC : GEOS-Chem and satellite data + - satellite gas + - GEOS-Chem gas + - satellite lat, lon + - satellite lat index, lon index + If build_jacobian=True, also include: + - K : Jacobian matrix + """ + + # Read satellite data + satellite, sat_ind = read_and_filter_satellite( + filename, satellite_product, gc_startdate, gc_enddate, xlim, ylim) + + # Number of satellite observations + n_obs = len(sat_ind[0]) + print("Found", n_obs, "satellite observations.") + + # get the lat/lons of gc gridcells + gc_lat_lon = get_gc_lat_lon(gc_cache, gc_startdate) + + # Define time threshold (hour 00 after the inversion period) + date_after_inversion = str(gc_enddate + np.timedelta64(1, 'D'))[:10].replace('-', '') + time_threshold = f"{date_after_inversion}_00" + + # map satellite obs into gridcells and average the observations + # into each gridcell. Only returns gridcells containing observations + obs_mapped_to_gc = average_satellite_observations(satellite, species, gc_lat_lon, sat_ind, time_threshold) + n_gridcells = len(obs_mapped_to_gc) + + if build_jacobian: + # Initialize Jacobian K + jacobian_K = np.zeros([n_gridcells, n_elements], dtype=np.float32) + jacobian_K.fill(np.nan) + + # create list to store the dates/hour of each gridcell + all_strdate = [gridcell["time"] for gridcell in obs_mapped_to_gc] + all_strdate = list(set(all_strdate)) + + # Read GEOS_Chem data for the dates of interest + all_date_gc = read_all_geoschem(all_strdate, gc_cache, build_jacobian, sensi_cache) + + # Initialize array with n_gridcells rows and 5 columns. Columns are + # satellite gas, GEOSChem gas, longitude, latitude, observation counts + obs_GC = np.zeros([n_gridcells, 5], dtype=np.float32) + obs_GC.fill(np.nan) + + # For each gridcell dict with satellite obs: + for i, gridcell_dict in enumerate(obs_mapped_to_gc): + + # Get GEOS-Chem data for the date of the observation: + p_sat = gridcell_dict["p_sat"] + dry_air_subcolumns = gridcell_dict["dry_air_subcolumns"] # mol m-2 + apriori = gridcell_dict["apriori"] # mol m-2 + avkern = gridcell_dict["avkern"] + strdate = gridcell_dict["time"] + GEOSCHEM = all_date_gc[strdate] + + # Get GEOS-Chem pressure edges for the cell + p_gc = GEOSCHEM["PEDGE"][gridcell_dict["iGC"], gridcell_dict["jGC"], :] + # Get GEOS-Chem species for the cell + gc_species = GEOSCHEM[species][gridcell_dict["iGC"], gridcell_dict["jGC"], :] + # Get merged GEOS-Chem/satellite pressure grid for the cell + merged = merge_pressure_grids(p_sat, p_gc) + # Remap GEOS-Chem species to TROPOMI pressure levels + sat_species = remap( + gc_species, + merged["data_type"], + merged["p_merge"], + merged["edge_index"], + merged["first_gc_edge"], + ) # volumetric mixing ratio + # Convert volumetric mixing ratio to mol m-2 + sat_species_molm2 = sat_species * 1/mixing_ratio_conv_factor(species) * dry_air_subcolumns # mol m-2 + # Derive the column-averaged mixing ratio that the satellite would see + # over this ground cell + virtual_satellite = apply_averaging_kernel( + apriori, avkern, sat_species_molm2, dry_air_subcolumns, species) + # Volumetric mixing ratio + + # If building Jacobian matrix from GEOS-Chem perturbation simulation sensitivity data: + if build_jacobian: + # Get GEOS-Chem perturbation sensitivities at this lat/lon, for all vertical levels and state vector elements + sensi_lonlat = GEOSCHEM["Sensitivities"][ + gridcell_dict["iGC"], gridcell_dict["jGC"], :, : + ] 
+ # Map the sensitivities to satellite pressure levels + sat_deltaspecies = remap_sensitivities( + sensi_lonlat, + merged["data_type"], + merged["p_merge"], + merged["edge_index"], + merged["first_gc_edge"], + ) # mixing ratio, unitless + # Tile the satellite averaging kernel + avkern_tiled = np.transpose(np.tile(avkern, (n_elements, 1))) + # Tile the satellite dry air subcolumns + dry_air_subcolumns_tiled = np.transpose( + np.tile(dry_air_subcolumns, (n_elements, 1)) + ) # mol m-2 + # Derive the change in column-averaged mixing ratios that TROPOMI would + # see over this ground cell + jacobian_K[i, :] = np.sum( + avkern_tiled * sat_deltaspecies * dry_air_subcolumns_tiled, 0 + ) / sum( + dry_air_subcolumns + ) # mixing ratio, unitless + + # Save actual and virtual satellite data + obs_GC[i, 0] = gridcell_dict[ + f"{species}" + ] # Actual satellite species column observation + obs_GC[i, 1] = virtual_satellite # Virtual satellite column observation + obs_GC[i, 2] = gridcell_dict["lon_sat"] # satellite longitude + obs_GC[i, 3] = gridcell_dict["lat_sat"] # satellite latitude + obs_GC[i, 4] = gridcell_dict["observation_count"] # observation counts + + # Output + output = {} + + # Always return the coincident satellite and GEOS-Chem data + output["obs_GC"] = obs_GC + + # Optionally return the Jacobian + if build_jacobian: + output["K"] = jacobian_K + + return output + + +def apply_satellite_operator( + filename, + species, + satellite_product, + n_elements, + gc_startdate, + gc_enddate, + xlim, + ylim, + gc_cache, + build_jacobian, + sensi_cache, +): + """ + Apply the satellite operator to map GEOS-Chem species data to satellite observation space. + + Arguments + filename [str] : Satellite netcdf data file to read + species [str] : The species (CH4 or CO2) to use + satellite_product [str] : "BlendedTROPOMI", "TROPOMI", or "Other", specifying the data used in the inversion. + n_elements [int] : Number of state vector elements + gc_startdate [datetime64] : First day of inversion period, for GEOS-Chem and satellite + gc_enddate [datetime64] : Last day of inversion period, for GEOS-Chem and satellite + xlim [float] : Longitude bounds for simulation domain + ylim [float] : Latitude bounds for simulation domain + gc_cache [str] : Path to GEOS-Chem output data + build_jacobian [log] : Are we trying to map GEOS-Chem sensitivities to satellite observation space? 
+ sensi_cache [str] : If build_jacobian=True, this is the path to the GEOS-Chem sensitivity data + + Returns + output [dict] : Dictionary with one or two fields: + - obs_GC : GEOS-Chem and satellite species data + - satellite species + - GEOS-Chem species + - satellite lat, lon + - satellite lat index, lon index + If build_jacobian=True, also include: + - K : Jacobian matrix + """ + + # Read satellite data + satellite, sat_ind = read_and_filter_satellite( + filename, satellite_product, gc_startdate, gc_enddate, xlim, ylim) + + # Number of satellite observations + n_obs = len(sat_ind[0]) + # print("Found", n_obs, "satellite observations.") + + # If need to build Jacobian from GEOS-Chem perturbation simulation sensitivity data: + if build_jacobian: + # Initialize Jacobian K + jacobian_K = np.zeros([n_obs, n_elements], dtype=np.float32) + jacobian_K.fill(np.nan) + + # Initialize a list to store the dates we want to look at + all_strdate = [] + + # Define time threshold (hour 00 after the inversion period) + date_after_inversion = str(gc_enddate + np.timedelta64(1, 'D'))[:10].replace('-', '') + time_threshold = f"{date_after_inversion}_00" + + # For each satellite observation + for k in range(n_obs): + # Get the date and hour + iSat = sat_ind[0][k] # lat index + jSat = sat_ind[1][k] # lon index + time = pd.to_datetime(str(satellite["time"][iSat,jSat])) + strdate = get_strdate(time, time_threshold) + all_strdate.append(strdate) + all_strdate = list(set(all_strdate)) + + # Read GEOS_Chem data for the dates of interest + all_date_gc = read_all_geoschem(all_strdate, gc_cache, build_jacobian, sensi_cache) + + # Initialize array with n_obs rows and 6 columns. Columns are satellite + # mixing ratio, GEOSChem mixing ratio, longitude, latitude, II, JJ + obs_GC = np.zeros([n_obs, 6], dtype=np.float32) + obs_GC.fill(np.nan) + + # For each satellite observation: + for k in range(n_obs): + + # Get GEOS-Chem data for the date of the observation: + iSat = sat_ind[0][k] + jSat = sat_ind[1][k] + p_sat = satellite["pressures"][iSat, jSat, :] + dry_air_subcolumns = satellite["dry_air_subcolumns"][iSat, jSat, :] # mol m-2 + apriori = satellite["profile_apriori"][iSat, jSat, :] # mol m-2 + avkern = satellite["column_AK"][iSat, jSat, :] + time = pd.to_datetime(str(satellite["time"][iSat,jSat])) + strdate = get_strdate(time, time_threshold) + GEOSCHEM = all_date_gc[strdate] + dlon = np.median(np.diff(GEOSCHEM["lon"])) # GEOS-Chem lon resolution + dlat = np.median(np.diff(GEOSCHEM["lat"])) # GEOS-Chem lon resolution + + # Find GEOS-Chem lats & lons closest to the corners of the satellite pixel + longitude_bounds = satellite["longitude_bounds"][iSat, jSat, :] + latitude_bounds = satellite["latitude_bounds"][iSat, jSat, :] + corners_lon_index = [] + corners_lat_index = [] + for l in range(4): + iGC = nearest_loc(longitude_bounds[l], GEOSCHEM["lon"], tolerance=max(dlon,0.5)) + jGC = nearest_loc(latitude_bounds[l], GEOSCHEM["lat"], tolerance=max(dlat,0.5)) + corners_lon_index.append(iGC) + corners_lat_index.append(jGC) + # If the tolerance in nearest_loc() is not satisfied, skip the observation + if np.nan in corners_lon_index + corners_lat_index: + continue + # Get lat/lon indexes and coordinates of GEOS-Chem grid cells closest to the satellite corners + ij_GC = [(x, y) for x in set(corners_lon_index) for y in set(corners_lat_index)] + gc_coords = [(GEOSCHEM["lon"][i], GEOSCHEM["lat"][j]) for i, j in ij_GC] + + # Compute the overlapping area between the satellite pixel and GEOS-Chem grid cells it touches + overlap_area 
= np.zeros(len(gc_coords)) + # Polygon representing satellite pixel + polygon_satellite = Polygon(np.column_stack((longitude_bounds, latitude_bounds))) + # For each GEOS-Chem grid cell that touches the satellite pixel: + for gridcellIndex in range(len(gc_coords)): + # Define polygon representing the GEOS-Chem grid cell + coords = gc_coords[gridcellIndex] + geoschem_corners_lon = [ + coords[0] - dlon / 2, + coords[0] + dlon / 2, + coords[0] + dlon / 2, + coords[0] - dlon / 2, + ] + geoschem_corners_lat = [ + coords[1] - dlat / 2, + coords[1] - dlat / 2, + coords[1] + dlat / 2, + coords[1] + dlat / 2, + ] + # If this is a global 2.0 x 2.5 grid, extend the eastern-most grid cells to 180 degrees + if (dlon == 2.5) & (coords[0] == 177.5): + for i in [1,2]: + geoschem_corners_lon[i] += dlon / 2 + + polygon_geoschem = Polygon( + np.column_stack((geoschem_corners_lon, geoschem_corners_lat)) + ) + # Calculate overlapping area as the intersection of the two polygons + if polygon_geoschem.intersects(polygon_satellite): + overlap_area[gridcellIndex] = polygon_satellite.intersection( + polygon_geoschem + ).area + + # If there is no overlap between GEOS-Chem and satellite, skip to next observation: + if sum(overlap_area) == 0: + continue + + # ======================================================= + # Map GEOS-Chem to satellite observation space + # ======================================================= + + # Otherwise, initialize satellite virtual mixing ratios and virtual + # sensitivity as zero + area_weighted_virtual_satellite = 0 # virtual satellite mixing ratio + area_weighted_virtual_satellite_sensitivity = 0 # virtual satellite sensitivity + + # For each GEOS-Chem grid cell that touches the satellite pixel: + for gridcellIndex in range(len(gc_coords)): + + # Get GEOS-Chem lat/lon indices for the cell + iGC, jGC = ij_GC[gridcellIndex] + + # Get GEOS-Chem pressure edges for the cell + p_gc = GEOSCHEM["PEDGE"][iGC, jGC, :] + + # Get GEOS-Chem mixing ratios for the cell + gc_species = GEOSCHEM[species][iGC, jGC, :] + + # Get merged GEOS-Chem/satellite pressure grid for the cell + merged = merge_pressure_grids(p_sat, p_gc) + + # Remap GEOS-Chem mixing ratios to satellite pressure levels + sat_species = remap( + gc_species, + merged["data_type"], + merged["p_merge"], + merged["edge_index"], + merged["first_gc_edge"], + ) # ppb + + # Convert volumetric mixing ratio to mol m-2 + sat_species_molm2 = sat_species * 1/mixing_ratio_conv_factor(species) * dry_air_subcolumns # mol m-2 + + # Derive the column-averaged mixing ratio that satellite would + # see over this ground cell + virtual_satellite_gridcellIndex = apply_averaging_kernel( + apriori, avkern, sat_species_molm2, dry_air_subcolumns, species + ) # Volumetric mixing ratio + + # Weight by overlapping area (to be divided out later) and add to sum + area_weighted_virtual_satellite += ( + overlap_area[gridcellIndex] * virtual_satellite_gridcellIndex + ) # ppb m2 + + # If building Jacobian matrix from GEOS-Chem perturbation simulation sensitivity data: + if build_jacobian: + + # Get GEOS-Chem perturbation sensitivities at this lat/lon, for all vertical levels and state vector elements + sensi_lonlat = GEOSCHEM["Sensitivities"][iGC, jGC, :, :] + + # Map the sensitivities to satellite pressure levels + sat_deltaspecies = remap_sensitivities( + sensi_lonlat, + merged["data_type"], + merged["p_merge"], + merged["edge_index"], + merged["first_gc_edge"], + ) # mixing ratio, unitless + + # Tile the satellite averaging kernel + avkern_tiled = 
np.transpose(np.tile(avkern, (n_elements, 1))) + + # Tile the satellite dry air subcolumns + dry_air_subcolumns_tiled = np.transpose( + np.tile(dry_air_subcolumns, (n_elements, 1)) + ) # mol m-2 + + # Derive the change in column-averaged mixing ratio that the + # satellite would see over this ground cell + satellite_sensitivity_gridcellIndex = np.sum( + avkern_tiled * sat_deltaspecies * dry_air_subcolumns_tiled, 0 + ) / sum( + dry_air_subcolumns + ) # mixing ratio, unitless + + # Weight by overlapping area (to be divided out later) and add to sum + area_weighted_virtual_satellite_sensitivity += ( + overlap_area[gridcellIndex] * satellite_sensitivity_gridcellIndex + ) # m2 + + # Compute virtual satellite observation as weighted mean by overlapping area + # i.e., need to divide out area [m2] from the previous step + virtual_satellite = area_weighted_virtual_satellite / sum(overlap_area) + + # For global inversions, area of overlap should equal area of satellite pixel + # This is because the GEOS-Chem grid is continuous + if dlon > 2.0: + assert abs(sum(overlap_area)-polygon_satellite.area)/polygon_satellite.area < 0.01, f"ERROR: overlap area ({sum(overlap_area)}) /= satellite pixel area ({polygon_satellite.area})" + + # Save actual and virtual satellite data + obs_GC[k, 0] = satellite[species][ + iSat, jSat + ] # Actual satellite mixing ratio column observation + obs_GC[k, 1] = virtual_satellite # Virtual satellite mixing ratio column observation + obs_GC[k, 2] = satellite["longitude"][iSat, jSat] # satellite longitude + obs_GC[k, 3] = satellite["latitude"][iSat, jSat] # satellite latitude + obs_GC[k, 4] = iSat # satellite index of longitude + obs_GC[k, 5] = jSat # satellite index of latitude + + if build_jacobian: + # Compute satellite sensitivity as weighted mean by overlapping area + # i.e., need to divide out area [m2] from the previous step + jacobian_K[k, :] = area_weighted_virtual_satellite_sensitivity / sum( + overlap_area + ) + + # Output + output = {} + + # Always return the coincident satellite and GEOS-Chem data + output["obs_GC"] = obs_GC + + # Optionally return the Jacobian + if build_jacobian: + output["K"] = jacobian_K + + return output + + +def average_satellite_observations(satellite, species, gc_lat_lon, sat_ind, time_threshold): + """ + Map TROPOMI observations into appropriate gc gridcells. Then average all + observations within a gridcell for processing. Use area weighting if + observation overlaps multiple gridcells. 
+ + Arguments + satellite [dict] : Dict of satellite data + species [str] : Name of species analyzed (CO2 or CH4) + gc_lat_lon [list] : list of dictionaries containing gc gridcell info + sat_ind [int] : index list of satellite data that passes filters + + Returns + output [dict[]] : flat list of dictionaries the following fields: + - lat : gridcell latitude + - lon : gridcell longitude + - iGC : longitude index value + - jGC : latitude index value + - lat_sat : averaged tropomi latitude + - lon_sat : averaged tropomi longitude + - overlap_area : averaged overlap area with gridcell + - p_sat : averaged pressure for sat + - dry_air_subcolumns : averaged + - apriori : averaged + - avkern : averaged average kernel + - time : averaged time + - $species : averaged species + - observation_count : number of observations averaged in cell + - observation_weights : area weights for the observation + + """ + n_obs = len(sat_ind[0]) + # print("Found", n_obs, "satellite observations.") + gc_lats = gc_lat_lon["lat"] + gc_lons = gc_lat_lon["lon"] + dlon = np.median(np.diff(gc_lat_lon["lon"])) # GEOS-Chem lon resolution + dlat = np.median(np.diff(gc_lat_lon["lat"])) # GEOS-Chem lon resolution + gridcell_dicts = get_gridcell_list(gc_lons, gc_lats) + + for k in range(n_obs): + iSat = sat_ind[0][k] # lat index + jSat = sat_ind[1][k] # lon index + + # Find GEOS-Chem lats & lons closest to the corners of the satellite pixel + longitude_bounds = satellite["longitude_bounds"][iSat, jSat, :] + latitude_bounds = satellite["latitude_bounds"][iSat, jSat, :] + corners_lon_index = [] + corners_lat_index = [] + + for l in range(4): + iGC = nearest_loc(longitude_bounds[l], gc_lons, tolerance=max(dlon,0.5)) + jGC = nearest_loc(latitude_bounds[l], gc_lats, tolerance=max(dlat,0.5)) + corners_lon_index.append(iGC) + corners_lat_index.append(jGC) + + # If the tolerance in nearest_loc() is not satisfied, skip the observation + if np.nan in corners_lon_index + corners_lat_index: + continue + + # Get lat/lon indexes and coordinates of GEOS-Chem grid cells closest to the satellite corners + ij_GC = [(x, y) for x in set(corners_lon_index) for y in set(corners_lat_index)] + gc_coords = [(gc_lons[i], gc_lats[j]) for i, j in ij_GC] + + # Compute the overlapping area between the satellite pixel and GEOS-Chem grid cells it touches + overlap_area = np.zeros(len(gc_coords)) + + # Polygon representing satellite pixel + polygon_satellite = Polygon(np.column_stack((longitude_bounds, latitude_bounds))) + for gridcellIndex in range(len(gc_coords)): + # Define polygon representing the GEOS-Chem grid cell + coords = gc_coords[gridcellIndex] + geoschem_corners_lon = [ + coords[0] - dlon / 2, + coords[0] + dlon / 2, + coords[0] + dlon / 2, + coords[0] - dlon / 2, + ] + geoschem_corners_lat = [ + coords[1] - dlat / 2, + coords[1] - dlat / 2, + coords[1] + dlat / 2, + coords[1] + dlat / 2, + ] + polygon_geoschem = Polygon( + np.column_stack((geoschem_corners_lon, geoschem_corners_lat)) + ) + # Calculate overlapping area as the intersection of the two polygons + if polygon_geoschem.intersects(polygon_satellite): + overlap_area[gridcellIndex] = polygon_satellite.intersection( + polygon_geoschem + ).area + # If there is no overlap between GEOS-Chem and satellite, skip to next observation: + total_overlap_area = sum(overlap_area) + + # iterate through any gridcells with observation overlap + # weight each observation if observation extent overlaps with multiple + # gridcells + for index, overlap in enumerate(overlap_area): + if not overlap == 0: + # 
get the matching dictionary for the gridcell with the overlap
+                gridcell_dict = gridcell_dicts[ij_GC[index][0]][ij_GC[index][1]]
+                gridcell_dict["lat_sat"].append(satellite["latitude"][iSat, jSat])
+                gridcell_dict["lon_sat"].append(satellite["longitude"][iSat, jSat])
+                gridcell_dict["overlap_area"].append(overlap)
+                gridcell_dict["p_sat"].append(satellite["pressures"][iSat, jSat, :])
+                gridcell_dict["dry_air_subcolumns"].append(
+                    satellite["dry_air_subcolumns"][iSat, jSat, :]
+                )
+                gridcell_dict["apriori"].append(
+                    satellite["profile_apriori"][iSat, jSat, :]
+                )
+                gridcell_dict["avkern"].append(satellite["column_AK"][iSat, jSat, :])
+                gridcell_dict[
+                    "time"
+                ].append(  # convert times to epoch time to make taking the mean easier
+                    int(pd.to_datetime(str(satellite["time"][iSat,jSat])).strftime("%s"))
+                )
+                gridcell_dict[species].append(
+                    satellite[species][iSat, jSat]
+                )  # Actual satellite mixing ratio column observation
+                # record weights for averaging later
+                gridcell_dict["observation_weights"].append(
+                    overlap / total_overlap_area
+                )
+                # increment the observation count based on overlap area
+                gridcell_dict["observation_count"] += overlap / total_overlap_area
+
+    # filter out gridcells without any observations
+    gridcell_dicts = [
+        item for item in gridcell_dicts.flatten() if item["observation_count"] > 0
+    ]
+    # weighted average observation values for each gridcell
+    for gridcell_dict in gridcell_dicts:
+        gridcell_dict["lat_sat"] = np.average(
+            gridcell_dict["lat_sat"], weights=gridcell_dict["observation_weights"],
+        )
+        gridcell_dict["lon_sat"] = np.average(
+            gridcell_dict["lon_sat"], weights=gridcell_dict["observation_weights"],
+        )
+        gridcell_dict["overlap_area"] = np.average(
+            gridcell_dict["overlap_area"], weights=gridcell_dict["observation_weights"],
+        )
+        gridcell_dict[species] = np.average(
+            gridcell_dict[species], weights=gridcell_dict["observation_weights"],
+        )
+        # take the weighted mean of the epoch times, then convert back to a
+        # GEOS-Chem filename time string
+        mean_time = pd.to_datetime(
+            np.average(
+                gridcell_dict["time"], weights=gridcell_dict["observation_weights"]
+            ),
+            unit="s",
+        )
+        gridcell_dict["time"] = get_strdate(mean_time, time_threshold)
+        # for multi-dimensional arrays, we only take the average across the 0 axis
+        gridcell_dict["p_sat"] = np.average(
+            gridcell_dict["p_sat"],
+            axis=0,
+            weights=gridcell_dict["observation_weights"],
+        )
+        gridcell_dict["dry_air_subcolumns"] = np.average(
+            gridcell_dict["dry_air_subcolumns"],
+            axis=0,
+            weights=gridcell_dict["observation_weights"],
+        )
+        gridcell_dict["apriori"] = np.average(
+            gridcell_dict["apriori"],
+            axis=0,
+            weights=gridcell_dict["observation_weights"],
+        )
+        gridcell_dict["avkern"] = np.average(
+            gridcell_dict["avkern"],
+            axis=0,
+            weights=gridcell_dict["observation_weights"],
+        )
+    return gridcell_dicts
+
+
+def apply_averaging_kernel(
+    apriori,
+    avkern,
+    sat_species_molm2,
+    dry_air_subcolumns,
+    species
+):
+    # Derive the column-averaged mixing ratio that the satellite would see
+    # over this ground cell using eq. 46 from TROPOMI Methane ATBD,
+    # Hasekamp et al.
2019 + virtual_satellite = ( + sum(apriori + avkern * (sat_species_molm2 - apriori)) + / sum(dry_air_subcolumns) + * mixing_ratio_conv_factor(species) + ) # volumetric mixing ratio + return virtual_satellite diff --git a/src/inversion_scripts/run_inversion.sh b/src/inversion_scripts/run_inversion.sh index e66f092f..65d15bb7 100755 --- a/src/inversion_scripts/run_inversion.sh +++ b/src/inversion_scripts/run_inversion.sh @@ -1,6 +1,5 @@ #!/bin/bash -#SBATCH -N 1 #SBATCH -o run_inversion_%j.out ##======================================================================= @@ -57,7 +56,7 @@ GCDir="./data_geoschem" GCVizDir="./data_geoschem_prior" JacobianDir="./data_converted" sensiCache="./data_sensitivities" -tropomiCache="${OutputPath}/${RunName}/satellite_data" +satelliteCache="${OutputPath}/${RunName}/satellite_data" period_i={PERIOD} # For Kalman filter: assume first inversion period (( period_i = 1 )) by default @@ -153,11 +152,11 @@ else fi -python jacobian.py ${invPath}/${configFile} $StartDate $EndDate $LonMinInvDomain $LonMaxInvDomain $LatMinInvDomain $LatMaxInvDomain $nElements $tropomiCache $BlendedTROPOMI $UseWaterObs $isPost $period_i $buildJacobian False; wait +python jacobian.py ${invPath}/${configFile} $StartDate $EndDate $LonMinInvDomain $LonMaxInvDomain $LatMinInvDomain $LatMaxInvDomain $nElements $Species $satelliteCache $SatelliteProduct $UseWaterObs $isPost $period_i $buildJacobian False; wait if "$LognormalErrors"; then # for lognormal error visualization of the prior we sample the prior run # without constructing the jacobian matrix - python jacobian.py ${invPath}/${configFile} $StartDate $EndDate $LonMinInvDomain $LonMaxInvDomain $LatMinInvDomain $LatMaxInvDomain $nElements $tropomiCache $BlendedTROPOMI $UseWaterObs $isPost $period_i False True; wait + python jacobian.py ${invPath}/${configFile} $StartDate $EndDate $LonMinInvDomain $LonMaxInvDomain $LatMinInvDomain $LatMaxInvDomain $nElements $Species $satelliteCache $SatelliteProduct $UseWaterObs $isPost $period_i False True; wait fi printf " DONE -- jacobian.py\n\n" diff --git a/src/inversion_scripts/setup_gc_cache.py b/src/inversion_scripts/setup_gc_cache.py index c6b536ce..f57d1487 100644 --- a/src/inversion_scripts/setup_gc_cache.py +++ b/src/inversion_scripts/setup_gc_cache.py @@ -7,8 +7,8 @@ def setup_gc_cache(startday, endday, gc_source_path, gc_destination_path): """ This script sets up a directory containing hourly GEOS-Chem output diagnostics - files. The hourly files are convenient for computing virtual TROPOMI columns - from the GEOS-Chem simulated atmosphere (to compare with the real TROPOMI columns). + files. The hourly files are convenient for computing virtual satellite columns + from the GEOS-Chem simulated atmosphere (to compare with the real satellite columns). Arguments startday [str] : First day of inversion period; formatted YYYYMMDD diff --git a/src/inversion_scripts/utils.py b/src/inversion_scripts/utils.py index 1a8979aa..d152ae32 100644 --- a/src/inversion_scripts/utils.py +++ b/src/inversion_scripts/utils.py @@ -1,4 +1,5 @@ import os +import sys import pickle from datetime import datetime, timedelta import numpy as np @@ -42,6 +43,26 @@ def zero_pad_num_hour(n): return nstr +def mixing_ratio_conv_factor(species): + if species == "CH4": + return 1e9 + elif species == "CO2": + return 1e6 + else: + raise ValueError(f"{species} is not recognized. 
+
+
 def sum_total_emissions(emissions, areas, mask):
     """
     Function to sum total emissions across the region of interest.
@@ -442,8 +463,196 @@ def calculate_superobservation_error(sO, p):
     s_super = np.sqrt(sO**2 * (((1 - r_retrieval) / p) + r_retrieval) + s_transport**2)
     return s_super
 
+def read_tropomi(filename):
+    """
+    Read TROPOMI data and save important variables to a dictionary.
+
+    Arguments
+        filename [str]  : TROPOMI netcdf data file to read
+
+    Returns
+        dat      [dict] : Dictionary of important variables from TROPOMI:
+                            - CH4
+                            - Latitude
+                            - Longitude
+                            - QA value
+                            - UTC time
+                            - Time (utc time reshaped for orbit)
+                            - Averaging kernel
+                            - SWIR albedo
+                            - NIR albedo
+                            - Blended albedo
+                            - CH4 prior profile
+                            - Dry air subcolumns
+                            - Latitude bounds
+                            - Longitude bounds
+                            - Vertical pressure profile
+    """
+
+    # Initialize dictionary for TROPOMI data
+    dat = {}
+
+    # Catch read errors in any of the variables
+    try:
+        # Store methane, QA, lat, lon, and time
+        with xr.open_dataset(filename, group="PRODUCT") as tropomi_data:
+            dat["CH4"] = tropomi_data["methane_mixing_ratio_bias_corrected"].values[0, :, :]
+            dat["qa_value"] = tropomi_data["qa_value"].values[0, :, :]
+            dat["longitude"] = tropomi_data["longitude"].values[0, :, :]
+            dat["latitude"] = tropomi_data["latitude"].values[0, :, :]
+
+            utc_str = tropomi_data["time_utc"].values[0, :]
+            utc_str = np.array([d.replace("Z", "") for d in utc_str]).astype("datetime64[ns]")
+            dat["time"] = np.repeat(utc_str[:, np.newaxis], dat["CH4"].shape[1], axis=1)
+
+        # Store column averaging kernel, SWIR and NIR surface albedo
+        with xr.open_dataset(filename, group="PRODUCT/SUPPORT_DATA/DETAILED_RESULTS") as tropomi_data:
+            dat["column_AK"] = tropomi_data["column_averaging_kernel"].values[0, :, :, ::-1]
+            dat["swir_albedo"] = tropomi_data["surface_albedo_SWIR"].values[0, :, :]
+            dat["nir_albedo"] = tropomi_data["surface_albedo_NIR"].values[0, :, :]
+            dat["blended_albedo"] = 2.4 * dat["nir_albedo"] - 1.13 * dat["swir_albedo"]
+
+        # Store methane prior profile, dry air subcolumns
+        with xr.open_dataset(filename, group="PRODUCT/SUPPORT_DATA/INPUT_DATA") as tropomi_data:
+            dat["profile_apriori"] = tropomi_data["methane_profile_apriori"].values[0, :, :, ::-1]  # mol m-2
+            dat["dry_air_subcolumns"] = tropomi_data["dry_air_subcolumns"].values[0, :, :, ::-1]  # mol m-2
+            dat["surface_classification"] = (tropomi_data["surface_classification"].values[0, :, :].astype("uint8") & 0x03).astype(int)
+
+            # Also get pressure interval and surface pressure for use below
+            pressure_interval = (tropomi_data["pressure_interval"].values[0, :, :] / 100)  # Pa -> hPa
+            surface_pressure = (tropomi_data["surface_pressure"].values[0, :, :] / 100)  # Pa -> hPa
+
+        # Store latitude and longitude bounds for pixels
+        with xr.open_dataset(filename, group="PRODUCT/SUPPORT_DATA/GEOLOCATIONS") as tropomi_data:
+            dat["longitude_bounds"] = tropomi_data["longitude_bounds"].values[0, :, :, :]
+            dat["latitude_bounds"] = tropomi_data["latitude_bounds"].values[0, :, :, :]
+
+        # Store vertical pressure profile
+        n1 = dat["CH4"].shape[0]  # length of along-track dimension (scanline) of retrieval field
+        n2 = dat["CH4"].shape[1]  # length of across-track dimension (ground_pixel) of retrieval field
+        pressures = np.full([n1, n2, 12 + 1], np.nan, dtype=np.float32)
+        for i in range(12 + 1):
+            pressures[:, :, i] = surface_pressure - i * pressure_interval
+        dat["pressures"] = pressures
+
+    # Return None if any of the variables could not be read
+    except Exception as e:
+        print(f"Error opening {filename}: {e}")
+        return None
+
+    return dat
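+# Illustrative usage (hypothetical filename, not part of the IMI source):
+#   dat = read_tropomi("S5P_RPRO_L2__CH4____20220725T152751_20220725T170921_24775_03_020400_20230201T100624.nc")
+#   if dat is not None:
+#       print(dat["CH4"].shape)        # (scanline, ground_pixel)
+#       print(dat["pressures"].shape)  # (scanline, ground_pixel, 13)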
+
+def read_blended(filename):
+    """
+    Read Blended TROPOMI+GOSAT data and save important variables to a dictionary.
+
+    Arguments
+        filename [str]  : Blended TROPOMI+GOSAT netcdf data file to read
+
+    Returns
+        dat      [dict] : Dictionary of important variables from Blended TROPOMI+GOSAT:
+                            - CH4
+                            - Latitude
+                            - Longitude
+                            - Time (utc time reshaped for orbit)
+                            - Averaging kernel
+                            - SWIR albedo
+                            - NIR albedo
+                            - Blended albedo
+                            - CH4 prior profile
+                            - Dry air subcolumns
+                            - Latitude bounds
+                            - Longitude bounds
+                            - Surface classification
+                            - Chi-Square for SWIR
+                            - Vertical pressure profile
+    """
+    assert "BLND" in filename, f"BLND not in filename {filename}, but a blended function is being used"
+
+    try:
+        # Initialize dictionary for Blended TROPOMI+GOSAT data
+        dat = {}
+
+        # Extract data from netCDF file to our dictionary
+        with xr.open_dataset(filename) as blended_data:
+
+            dat["CH4"] = blended_data["methane_mixing_ratio_blended"].values[:]
+            dat["longitude"] = blended_data["longitude"].values[:]
+            dat["latitude"] = blended_data["latitude"].values[:]
+            dat["column_AK"] = blended_data["column_averaging_kernel"].values[:, ::-1]
+            dat["swir_albedo"] = blended_data["surface_albedo_SWIR"].values[:]
+            dat["nir_albedo"] = blended_data["surface_albedo_NIR"].values[:]
+            dat["blended_albedo"] = 2.4 * dat["nir_albedo"] - 1.13 * dat["swir_albedo"]
+            dat["profile_apriori"] = blended_data["methane_profile_apriori"].values[:, ::-1]
+            dat["dry_air_subcolumns"] = blended_data["dry_air_subcolumns"].values[:, ::-1]
+            dat["longitude_bounds"] = blended_data["longitude_bounds"].values[:]
+            dat["latitude_bounds"] = blended_data["latitude_bounds"].values[:]
+            dat["surface_classification"] = (blended_data["surface_classification"].values[:].astype("uint8") & 0x03).astype(int)
+            dat["chi_square_SWIR"] = blended_data["chi_square_SWIR"].values[:]
+
+            # Remove "Z" from time so that numpy doesn't throw a warning
+            utc_str = blended_data["time_utc"].values[:]
+            dat["time"] = np.array([d.replace("Z", "") for d in utc_str]).astype("datetime64[ns]")
+
+            # Need to calculate the pressures for the 13 TROPOMI levels (the edges of the 12 layers)
+            pressure_interval = (blended_data["pressure_interval"].values[:] / 100)  # Pa -> hPa
+            surface_pressure = (blended_data["surface_pressure"].values[:] / 100)  # Pa -> hPa
+            n = len(dat["CH4"])
+            pressures = np.full([n, 12 + 1], np.nan, dtype=np.float32)
+            for i in range(12 + 1):
+                pressures[:, i] = surface_pressure - i * pressure_interval
+            dat["pressures"] = pressures
+
+        # Add an axis here to mimic the (scanline, ground_pixel) format of operational TROPOMI data
+        # This is so the blended data will be compatible with the TROPOMI operators
+        for key in dat.keys():
+            dat[key] = np.expand_dims(dat[key], axis=0)
+
+    except Exception as e:
+        print(f"Error opening {filename}: {e}")
+        return None
+
+    return dat
"BlendedTROPOMI": + satellite = read_blended(filename) + elif satellite_str == "TROPOMI": + satellite = read_tropomi(filename) + else: + print("Other data source is not currently supported --HON") + sys.exit(1) + + # If empty, skip this file + if satellite == None: + print(f"Skipping {filename} due to file processing issue.") + return satellite + + # Filter the data + if satellite_str == "BlendedTROPOMI": + # Only going to consider blended data within lat/lon/time bounds and wihtout problematic coastal pixels + sat_ind = filter_blended(satellite, xlim, ylim, + gc_startdate, gc_enddate, use_water_obs) + elif satellite_str == "TROPOMI": + # Only going to consider TROPOMI data within lat/lon/time bounds and with QA > 0.5 + sat_ind = filter_tropomi(satellite, xlim, ylim, + gc_startdate, gc_enddate, use_water_obs) + else: + print("Other data source filtering is not currently supported --HON") + sys.exit(1) + + return satellite, sat_ind -def get_posterior_emissions(prior, scale): +def get_posterior_emissions(prior, scale, species): """ Function to calculate the posterior emissions from the prior and the scale factors. Properly accounting for no optimization @@ -465,11 +674,11 @@ def get_posterior_emissions(prior, scale): # To do this, we: # make a copy of the original soil sink - prior_soil_sink = prior["EmisCH4_SoilAbsorb"].copy() - + prior_soil_sink = prior[f"Emis{species}_SoilAbsorb"].copy() + # remove the soil sink from the prior total before applying scale factors - prior["EmisCH4_Total"] = prior["EmisCH4_Total"] - prior_soil_sink - + prior[f"Emis{species}_Total"] = prior[f"Emis{species}_Total"] - prior_soil_sink + # scale the prior emissions for all sectors using the scale factors posterior = prior.copy() for ds_var in list(prior.keys()): @@ -477,10 +686,10 @@ def get_posterior_emissions(prior, scale): posterior[ds_var] = prior[ds_var] * scale["ScaleFactor"] # But reset the soil sink to the original value - posterior["EmisCH4_SoilAbsorb"] = prior_soil_sink - + posterior[f"Emis{species}_SoilAbsorb"] = prior_soil_sink + # Add the original soil sink back to the total emissions - posterior["EmisCH4_Total"] = posterior["EmisCH4_Total"] + prior_soil_sink + posterior[f"Emis{species}_Total"] = posterior[f"Emis{species}_Total"] + prior_soil_sink return posterior diff --git a/src/notebooks/kf_notebook.ipynb b/src/notebooks/kf_notebook.ipynb index 1d21d941..d28084e1 100644 --- a/src/notebooks/kf_notebook.ipynb +++ b/src/notebooks/kf_notebook.ipynb @@ -53,6 +53,9 @@ "config = yaml.load(\n", " open(\"/home/ubuntu/integrated_methane_inversion/config.yml\"), Loader=yaml.FullLoader\n", ")" + "\n", + "# Save out the species argument\n", + "species = config[\"Species\"]" ] }, { @@ -202,7 +205,7 @@ " get_period_mean_emissions(prior_cache_path, period + 1, \"./../periods.csv\")\n", " for period in range(periods_df.shape[0])\n", "]\n", - "priors = [prior[\"EmisCH4_Total\"] for prior in priors_ds]\n", + "priors = [prior[f\"Emis{species}_Total\"] for prior in priors_ds]\n", "\n", "# Optimized scale factors\n", "scales = [xr.load_dataset(sf_path) for sf_path in sf_paths]\n", @@ -211,7 +214,7 @@ "posteriors_ds = [\n", " get_posterior_emissions(priors_ds[i], scales[i]) for i in range(num_periods)\n", "]\n", - "posteriors = [posterior[\"EmisCH4_Total\"] for posterior in posteriors_ds]" + "posteriors = [posterior[f\"Emis{species}_Total\"] for posterior in posteriors_ds]" ] }, { diff --git a/src/notebooks/visualization_notebook.ipynb b/src/notebooks/visualization_notebook.ipynb index 4c02b408..0e616fc1 100644 --- 
diff --git a/src/notebooks/kf_notebook.ipynb b/src/notebooks/kf_notebook.ipynb
index 1d21d941..d28084e1 100644
--- a/src/notebooks/kf_notebook.ipynb
+++ b/src/notebooks/kf_notebook.ipynb
@@ -53,6 +53,9 @@
 "config = yaml.load(\n",
 "    open(\"/home/ubuntu/integrated_methane_inversion/config.yml\"), Loader=yaml.FullLoader\n",
-")"
+")",
+"\n",
+"# Save out the species argument\n",
+"species = config[\"Species\"]"
 ]
 },
 {
@@ -202,7 +205,7 @@
 "    get_period_mean_emissions(prior_cache_path, period + 1, \"./../periods.csv\")\n",
 "    for period in range(periods_df.shape[0])\n",
 "]\n",
-"priors = [prior[\"EmisCH4_Total\"] for prior in priors_ds]\n",
+"priors = [prior[f\"Emis{species}_Total\"] for prior in priors_ds]\n",
 "\n",
 "# Optimized scale factors\n",
 "scales = [xr.load_dataset(sf_path) for sf_path in sf_paths]\n",
@@ -211,7 +214,7 @@
 "posteriors_ds = [\n",
-"    get_posterior_emissions(priors_ds[i], scales[i]) for i in range(num_periods)\n",
+"    get_posterior_emissions(priors_ds[i], scales[i], species) for i in range(num_periods)\n",
 "]\n",
-"posteriors = [posterior[\"EmisCH4_Total\"] for posterior in posteriors_ds]"
+"posteriors = [posterior[f\"Emis{species}_Total\"] for posterior in posteriors_ds]"
 ]
 },
 {
diff --git a/src/notebooks/visualization_notebook.ipynb b/src/notebooks/visualization_notebook.ipynb
index 4c02b408..0e616fc1 100644
--- a/src/notebooks/visualization_notebook.ipynb
+++ b/src/notebooks/visualization_notebook.ipynb
@@ -73,6 +73,8 @@
 "config = yaml.load(\n",
 "    open(\"/home/ubuntu/integrated_methane_inversion/config.yml\"), Loader=yaml.FullLoader\n",
-")"
+")",
+"# Save out the species as its own variable\n",
+"species = config[\"Species\"]"
 ]
 },
 {
@@ -218,21 +220,21 @@
 "source": [
 "# Prior emissions\n",
 "prior_ds = get_mean_emissions(start_date, end_date, prior_cache)\n",
-"prior = prior_ds[\"EmisCH4_Total\"]\n",
+"prior = prior_ds[f\"Emis{species}_Total\"]\n",
 "\n",
 "if config[\"KalmanMode\"]:\n",
 "    # properly apply nudged sfs to prior in Kalman mode\n",
 "    prior_sf = xr.load_dataset(prior_sf_pth)\n",
-"    prior_ds = get_posterior_emissions(prior_ds, prior_sf)\n",
-"    prior = prior_ds[\"EmisCH4_Total\"]\n",
+"    prior_ds = get_posterior_emissions(prior_ds, prior_sf, species)\n",
+"    prior = prior_ds[f\"Emis{species}_Total\"]\n",
 "\n",
 "# Optimized scale factors\n",
 "scale_ds = xr.load_dataset(results_pth)\n",
 "scale = scale_ds[\"ScaleFactor\"]\n",
 "\n",
 "# Posterior emissions\n",
-"posterior_ds = get_posterior_emissions(prior_ds, scale_ds)\n",
-"posterior = posterior_ds[\"EmisCH4_Total\"]"
+"posterior_ds = get_posterior_emissions(prior_ds, scale_ds, species)\n",
+"posterior = posterior_ds[f\"Emis{species}_Total\"]"
 ]
 },
 {
@@ -344,7 +346,7 @@
 "    if post_val > 0 or prior_val > 0:\n",
 "        post_sector_vals.append(post_val)\n",
 "        prior_sector_vals.append(prior_val)\n",
-"        positive_sectors.append(sector.replace(\"EmisCH4_\", \"\"))\n",
+"        positive_sectors.append(sector.replace(f\"Emis{species}_\", \"\"))\n",
 "\n",
 "# Combine the lists into tuples and sort them based on post_sector_vals\n",
 "combined = list(zip(positive_sectors, prior_sector_vals, post_sector_vals))\n",
@@ -477,7 +479,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"## Open TROPOMI and GEOS-Chem columns"
+"## Open satellite and GEOS-Chem columns"
 ]
 },
 {
@@ -486,12 +488,12 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"# Get observed and GEOS-Chem-simulated TROPOMI columns\n",
+"# Get observed and GEOS-Chem-simulated satellite columns\n",
 "def aggregate_data(data_dir, data_posterior):\n",
 "    files = np.sort(os.listdir(data_dir))\n",
 "    lat = np.array([])\n",
 "    lon = np.array([])\n",
-"    tropomi = np.array([])\n",
+"    satellite = np.array([])\n",
 "    geos_prior = np.array([])\n",
 "    geos_posterior = np.array([])\n",
 "    observation_count = np.array([])\n",
@@ -500,13 +502,13 @@
 "        # Get paths\n",
 "        pth = os.path.join(data_dir, f)\n",
 "        pth_posterior = os.path.join(data_posterior, f)\n",
-"        # Load TROPOMI/GEOS-Chem and Jacobian matrix data from the .pkl file\n",
+"        # Load satellite/GEOS-Chem and Jacobian matrix data from the .pkl file\n",
 "        obj = load_obj(pth)\n",
 "        obj_posterior = load_obj(pth_posterior)\n",
-"        # If there aren't any TROPOMI observations on this day, skip\n",
+"        # If there aren't any satellite observations on this day, skip\n",
 "        if obj[\"obs_GC\"].shape[0] == 0:\n",
 "            continue\n",
-"        # Otherwise, grab the TROPOMI/GEOS-Chem data\n",
+"        # Otherwise, grab the satellite/GEOS-Chem data\n",
 "        obs_GC = obj[\"obs_GC\"]\n",
 "        obs_GC_posterior = obj_posterior[\"obs_GC\"]\n",
 "        # Only consider data within latitude and longitude bounds\n",
@@ -518,12 +520,12 @@
 "        )\n",
 "        if len(ind[0]) == 0:  # Skip if no data in bounds\n",
 "            continue\n",
-"        obs_GC = obs_GC[ind[0], :]  # TROPOMI and GEOS-Chem data within bounds\n",
+"        obs_GC = obs_GC[ind[0], :]  # satellite and GEOS-Chem data within bounds\n",
 "        obs_GC_posterior = obs_GC_posterior[ind[0], :]\n",
-"        # Record lat, lon, tropomi ch4, and geos ch4\n",
+"        # Record lat, lon, satellite mixing ratio, and geos mixing ratio\n",
 "        lat = np.concatenate((lat, obs_GC[:, 3]))\n",
 "        lon = np.concatenate((lon, obs_GC[:, 2]))\n",
-"        tropomi = np.concatenate((tropomi, obs_GC[:, 0]))\n",
+"        satellite = np.concatenate((satellite, obs_GC[:, 0]))\n",
 "        geos_prior = np.concatenate((geos_prior, obs_GC[:, 1]))\n",
 "        observation_count = np.concatenate((observation_count, obs_GC[:, 4]))\n",
 "        geos_posterior = np.concatenate((geos_posterior, obs_GC_posterior[:, 1]))\n",
@@ -531,11 +533,11 @@
 "    df = pd.DataFrame()\n",
 "    df[\"lat\"] = lat\n",
 "    df[\"lon\"] = lon\n",
-"    df[\"tropomi\"] = tropomi\n",
+"    df[\"satellite\"] = satellite\n",
 "    df[\"geos_prior\"] = geos_prior\n",
 "    df[\"geos_posterior\"] = geos_posterior\n",
-"    df[\"diff_tropomi_prior\"] = geos_prior - tropomi\n",
-"    df[\"diff_tropomi_posterior\"] = geos_posterior - tropomi\n",
+"    df[\"diff_satellite_prior\"] = geos_prior - satellite\n",
+"    df[\"diff_satellite_posterior\"] = geos_posterior - satellite\n",
 "    df[\"observation_count\"] = observation_count\n",
 "\n",
 "    return df\n",
@@ -543,10 +545,10 @@
 "\n",
 "superobs_df = aggregate_data(satdat_dir, posterior_dir)\n",
 "visualization_df = aggregate_data(visualization_dir, posterior_viz_dir)\n",
-"n_obs = len(superobs_df[\"tropomi\"])\n",
+"n_obs = len(superobs_df[\"satellite\"])\n",
 "\n",
 "print(\n",
-"    f'Found {n_obs} super-observations in the domain, representing {np.sum(superobs_df[\"observation_count\"]).round(0)} TROPOMI observations.'\n",
+"    f'Found {n_obs} super-observations in the domain, representing {np.sum(superobs_df[\"observation_count\"]).round(0)} satellite observations.'\n",
 ")\n",
 "superobs_df.head()"
 ]
@@ -565,17 +567,17 @@
 "outputs": [],
 "source": [
 "# calculate some statistics for the prior\n",
-"prior_std = np.round(np.std(superobs_df[\"diff_tropomi_prior\"]), 2)\n",
+"prior_std = np.round(np.std(superobs_df[\"diff_satellite_prior\"]), 2)\n",
 "prior_bias = np.round(\n",
 "    np.average(\n",
-"        superobs_df[\"diff_tropomi_prior\"], weights=superobs_df[\"observation_count\"]\n",
+"        superobs_df[\"diff_satellite_prior\"], weights=superobs_df[\"observation_count\"]\n",
 "    ),\n",
 "    2,\n",
 ")\n",
 "prior_RMSE = np.round(\n",
 "    np.sqrt(\n",
 "        np.average(\n",
-"            superobs_df[\"diff_tropomi_prior\"] ** 2,\n",
+"            superobs_df[\"diff_satellite_prior\"] ** 2,\n",
 "            weights=superobs_df[\"observation_count\"],\n",
 "        )\n",
 "    ),\n",
@@ -583,17 +585,17 @@
 ")\n",
 "\n",
 "# and the posterior\n",
-"posterior_std = np.round(np.std(superobs_df[\"diff_tropomi_posterior\"]), 2)\n",
+"posterior_std = np.round(np.std(superobs_df[\"diff_satellite_posterior\"]), 2)\n",
 "posterior_bias = np.round(\n",
 "    np.average(\n",
-"        superobs_df[\"diff_tropomi_posterior\"], weights=superobs_df[\"observation_count\"]\n",
+"        superobs_df[\"diff_satellite_posterior\"], weights=superobs_df[\"observation_count\"]\n",
 "    ),\n",
 "    2,\n",
 ")\n",
 "posterior_RMSE = np.round(\n",
 "    np.sqrt(\n",
 "        np.average(\n",
-"            superobs_df[\"diff_tropomi_posterior\"] ** 2,\n",
+"            superobs_df[\"diff_satellite_posterior\"] ** 2,\n",
 "            weights=superobs_df[\"observation_count\"],\n",
 "        )\n",
 "    ),\n",
@@ -627,7 +629,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"## Compare TROPOMI and GEOS-Chem columns"
+"## Compare satellite and GEOS-Chem columns"
 ]
 },
 {
+ "# Simple averaging scheme to grid the mixing ratio data at 0.1 x 0.1 resolution\n", "df_copy = visualization_df.copy() # save for later\n", "visualization_df[\"lat\"] = np.round(visualization_df[\"lat\"], 1)\n", "visualization_df[\"lon\"] = np.round(visualization_df[\"lon\"], 1)\n", @@ -663,21 +665,21 @@ "metadata": {}, "outputs": [], "source": [ - "# Mean TROPOMI XCH4 columns on 0.1 x 0.1 grid\n", + "# Mean satellite mixing ratio columns on 0.1 x 0.1 grid\n", "fig = plt.figure(figsize=(8, 8))\n", "ax = fig.subplots(1, 1, subplot_kw={\"projection\": ccrs.PlateCarree()})\n", "\n", "xch4_min, xch4_max = dynamic_range(ds[\"tropomi\"].values)\n", "plot_field(\n", " ax,\n", - " ds[\"tropomi\"],\n", + " ds[\"satellite\"],\n", " cmap=\"Spectral_r\",\n", " vmin=xch4_min,\n", " vmax=xch4_max,\n", " lon_bounds=lon_bounds,\n", " lat_bounds=lat_bounds,\n", - " title=\"TROPOMI $X_{CH4}$\",\n", - " clean_title=\"TROPOMI XCH4\",\n", + " title=f\"Satellite $X_{species}$\",\n", + " clean_title=f\"Satellite X{species}\",\n", " cbar_label=\"Column mixing ratio (ppb)\",\n", " mask=mask,\n", " only_ROI=False,\n", @@ -692,7 +694,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Mean prior and posterior GEOS-Chem XCH4 columns on 0.1 x 0.1 grid\n", + "# Mean prior and posterior GEOS-Chem mixing ratio columns on 0.1 x 0.1 grid\n", "fig = plt.figure(figsize=(25, 8))\n", "ax1, ax2 = fig.subplots(1, 2, subplot_kw={\"projection\": ccrs.PlateCarree()})\n", "\n", @@ -704,8 +706,8 @@ " vmax=xch4_max,\n", " lon_bounds=lon_bounds,\n", " lat_bounds=lat_bounds,\n", - " title=\"GEOS-Chem $X_{CH4}$ (prior simulation)\",\n", - " clean_title=\"GEOS-Chem XCH4 (prior)\",\n", + " title=f\"GEOS-Chem $X_{species}$ (prior simulation)\",\n", + " clean_title=f\"GEOS-Chem X{species} (prior)\",\n", " cbar_label=\"Dry column mixing ratio (ppb)\",\n", " mask=mask,\n", " only_ROI=False,\n", @@ -720,8 +722,8 @@ " vmax=xch4_max,\n", " lon_bounds=lon_bounds,\n", " lat_bounds=lat_bounds,\n", - " title=\"GEOS-Chem $X_{CH4}$ (posterior simulation)\",\n", - " clean_title=\"GEOS-Chem XCH4 (Prior&Posterior)\",\n", + " title=f\"GEOS-Chem $X_{species}$ (posterior simulation)\",\n", + " clean_title=f\"GEOS-Chem X{species} (Prior&Posterior)\",\n", " cbar_label=\"Dry column mixing ratio (ppb)\",\n", " mask=mask,\n", " only_ROI=False,\n", @@ -736,20 +738,20 @@ "metadata": {}, "outputs": [], "source": [ - "# Plot differences between GEOS-Chem and TROPOMI XCH4\n", + "# Plot differences between GEOS-Chem and satellite mixing ratios\n", "fig = plt.figure(figsize=(25, 8))\n", "ax1, ax2 = fig.subplots(1, 2, subplot_kw={\"projection\": ccrs.PlateCarree()})\n", "\n", "plot_field(\n", " ax1,\n", - " ds[\"diff_tropomi_prior\"],\n", + " ds[\"diff_satellite_prior\"],\n", " cmap=\"RdBu_r\",\n", " vmin=-40,\n", " vmax=40,\n", " lon_bounds=lon_bounds,\n", " lat_bounds=lat_bounds,\n", - " title=\"Prior $-$ TROPOMI\",\n", - " clean_title=\"Prior-TROPOMI\",\n", + " title=\"Prior $-$ satellite\",\n", + " clean_title=\"Prior-satellite\",\n", " cbar_label=\"ppb\",\n", " mask=mask,\n", " only_ROI=False,\n", @@ -758,14 +760,14 @@ "\n", "plot_field(\n", " ax2,\n", - " ds[\"diff_tropomi_posterior\"],\n", + " ds[\"diff_satellite_posterior\"],\n", " cmap=\"RdBu_r\",\n", " vmin=-40,\n", " vmax=40,\n", " lon_bounds=lon_bounds,\n", " lat_bounds=lat_bounds,\n", - " title=\"Posterior $-$ TROPOMI\",\n", - " clean_title=\"(Prior&Posterior)-TROPOMI\",\n", + " title=\"Posterior $-$ satellite\",\n", + " clean_title=\"(Prior&Posterior)-satellite\",\n", " cbar_label=\"ppb\",\n", " 
@@ -780,7 +782,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"# Plot differences between posterior and prior simulated XCH4\n",
+"# Plot differences between posterior and prior simulated mixing ratios\n",
 "fig = plt.figure(figsize=(8, 8))\n",
 "ax = fig.subplots(1, 1, subplot_kw={\"projection\": ccrs.PlateCarree()})\n",
 "\n",
@@ -794,8 +796,8 @@
 "    vmax=np.nanmax(diff),\n",
 "    lon_bounds=lon_bounds,\n",
 "    lat_bounds=lat_bounds,\n",
-"    title=\"$\\Delta X_{CH4}$ (Posterior $-$ Prior)\",\n",
-"    clean_title=\"Delta XCH4 (Posterior-Prior)\",\n",
+"    title=f\"$\\Delta X_{{{species}}}$ (Posterior $-$ Prior)\",\n",
+"    clean_title=f\"Delta X{species} (Posterior-Prior)\",\n",
 "    cbar_label=\"ppb\",\n",
 "    mask=mask,\n",
 "    only_ROI=False,\n",
@@ -817,7 +819,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"# Simple averaging scheme to grid the XCH4 data at 0.1 x 0.1 resolution\n",
+"# Simple averaging scheme to grid the mixing ratio data at 0.1 x 0.1 resolution\n",
 "df_copy = superobs_df.copy()  # save for later\n",
 "superobs_df[\"lat\"] = np.round(superobs_df[\"lat\"], 1)\n",
 "superobs_df[\"lon\"] = np.round(superobs_df[\"lon\"], 1)\n",
@@ -867,7 +869,7 @@
 "lon_b = np.arange(ds[\"lon\"][0] - 0.05, ds[\"lon\"][-1] + 0.1, 0.1)\n",
 "ds = ds.assign_coords(lon_b=(\"lon_b\", lon_b))\n",
 "ds = ds.assign_coords(lat_b=(\"lat_b\", lat_b))\n",
-"ds[\"mask\"] = xr.where(~np.isnan(ds[\"tropomi\"]), 1, 0)"
+"ds[\"mask\"] = xr.where(~np.isnan(ds[\"satellite\"]), 1, 0)"
 ]
 },
 {
@@ -919,20 +921,20 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"# Re-plot differences between GEOS-Chem and TROPOMI XCH4\n",
+"# Re-plot differences between GEOS-Chem and satellite mixing ratios\n",
 "fig = plt.figure(figsize=(25, 8))\n",
 "ax1, ax2 = fig.subplots(1, 2, subplot_kw={\"projection\": ccrs.PlateCarree()})\n",
 "\n",
 "plot_field(\n",
 "    ax1,\n",
-"    ds_regrid[\"diff_tropomi_prior\"],\n",
+"    ds_regrid[\"diff_satellite_prior\"],\n",
 "    cmap=\"RdBu_r\",\n",
 "    vmin=-25,\n",
 "    vmax=25,\n",
 "    lon_bounds=lon_bounds,\n",
 "    lat_bounds=lat_bounds,\n",
-"    title=\"Prior $-$ TROPOMI\",\n",
-"    clean_title=\"Prior-TROPOMI (regridded)\",\n",
+"    title=\"Prior $-$ satellite\",\n",
+"    clean_title=\"Prior-satellite (regridded)\",\n",
 "    cbar_label=\"ppb\",\n",
 "    mask=mask,\n",
 "    only_ROI=False,\n",
@@ -941,14 +943,14 @@
 "\n",
 "plot_field(\n",
 "    ax2,\n",
-"    ds_regrid[\"diff_tropomi_posterior\"],\n",
+"    ds_regrid[\"diff_satellite_posterior\"],\n",
 "    cmap=\"RdBu_r\",\n",
 "    vmin=-25,\n",
 "    vmax=25,\n",
 "    lon_bounds=lon_bounds,\n",
 "    lat_bounds=lat_bounds,\n",
-"    title=\"Posterior $-$ TROPOMI\",\n",
-"    clean_title=\"(Prior&Posterior)-TROPOMI (regridded)\",\n",
+"    title=\"Posterior $-$ satellite\",\n",
+"    clean_title=\"(Prior&Posterior)-satellite (regridded)\",\n",
 "    cbar_label=\"ppb\",\n",
 "    mask=mask,\n",
 "    only_ROI=False,\n",
@@ -963,7 +965,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"# Re-plot differences between posterior and prior simulated XCH4\n",
+"# Re-plot differences between posterior and prior simulated mixing ratios\n",
 "fig = plt.figure(figsize=(8, 8))\n",
 "ax = fig.subplots(1, 1, subplot_kw={\"projection\": ccrs.PlateCarree()})\n",
 "\n",
diff --git a/src/utilities/common.sh b/src/utilities/common.sh
index 07494709..71a40083 100644
--- a/src/utilities/common.sh
+++ b/src/utilities/common.sh
@@ -2,12 +2,118 @@
 # Common shell function for the IMI
 # Functions available in this file include:
+#     - submit_job
+#     - submit_slurm_job
+#     - submit_pbs_job
 #     - print_stats
 #     - imi_failed
 #     - ncmax
 #     - ncmin
 
-# Description:
+# Description:
+#   Submit a job with default IMI settings using either SBATCH or PBS
+# Usage:
+#   submit_job $SchedulerType $SaveOutput $JobArguments
+submit_job() {
+    if [[ $1 = "slurm" || $1 = "tmux" ]]; then
+        submit_slurm_job "${@:2}"
+    elif [[ $1 = "PBS" ]]; then
+        submit_pbs_job "${@:2}"
+    else
+        echo "Scheduler type $1 not recognized."
+    fi
+
+    # If output was saved, concatenate it to imi_output.log
+    if [[ $2 = "true" ]]; then
+        cat imi_output.tmp >> ${InversionPath}/imi_output.log
+        rm imi_output.tmp
+    fi
+}
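+# Illustrative call (hypothetical script and arguments, not part of the IMI source):
+#   submit_job "slurm" "true" ./run_inversion.sh arg1 arg2
+# dispatches `sbatch ... ./run_inversion.sh arg1 arg2`, waits for it to finish,
+# and appends the captured imi_output.tmp to ${InversionPath}/imi_output.log.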
+
+# Description:
+#   Submit a job with default IMI settings using SBATCH
+# Usage:
+#   submit_slurm_job $SaveOutput $JobArguments
+submit_slurm_job() {
+    # If saving output, write it to imi_output.tmp
+    if [[ $1 = "true" ]]; then
+        sbatch -N 1 \
+            --mem $RequestedMemory \
+            -c $RequestedCPUs \
+            -t $RequestedTime \
+            -p $SchedulerPartition \
+            -o imi_output.tmp \
+            -W ${@:2}; wait;
+    else
+        sbatch -N 1 \
+            --mem $RequestedMemory \
+            -c $RequestedCPUs \
+            -t $RequestedTime \
+            -p $SchedulerPartition \
+            -W ${@:2}; wait;
+    fi
+}
+
+# Description:
+#   Submit a job with default IMI settings using PBS
+# Usage:
+#   submit_pbs_job $SaveOutput $JobArguments
+submit_pbs_job() {
+    # If saving output, write it to imi_output.tmp
+    if [[ $1 = "true" ]]; then
+        qsub -lselect=1:ncpus=$RequestedCPUs:mem=$RequestedMemory:model=ivy \
+            -l walltime=$RequestedTime -q devel -o imi_output.tmp \
+            -Wblock=true -- ${@:2}; wait;
+    else
+        qsub -lselect=1:ncpus=$RequestedCPUs:mem=$RequestedMemory:model=ivy \
+            -l walltime=$RequestedTime -q devel \
+            -Wblock=true -- ${@:2}; wait;
+    fi
+}
+
+# Description:
+#   Convert SBATCH directives in IMI scripts to their PBS equivalents
+convert_sbatch_to_pbs() {
+    DataPaths=($OutputPath $DataPath $DataPathObs $HOME)
+    declare -a SitesNeeded=()
+    for DP in ${DataPaths[@]}; do
+        SitesNeeded_DP=$( find $DP/ -type l -exec realpath {} \; | cut -d/ -f2 | sort -u )
+        for NS in ${SitesNeeded_DP[*]}; do
+            if ! [[ ${SitesNeeded[@]} =~ $NS ]]; then
+                SitesNeeded+=("${NS}+")
+            fi
+        done
+    done
+    SitesNeeded=$(IFS=/ ; echo "${SitesNeeded[*]}")
+    SitesNeeded="/${SitesNeeded::-1}"
+
+    # Get files containing SBATCH directives
+    current_dir=$(pwd)
+    sbatch_files=($(grep -rl "SBATCH" . --exclude-dir={"GCClassic",".git","*utilities*"}))
+    echo "Replacing SBATCH with PBS in the following files:"
+    for file in ${sbatch_files[@]}; do
+        f=${current_dir}${file:1}
+        echo "  ${f}"
+
+        # First, insert needed sites at the top of every file (skip files that already have them)
+        if ! grep -q "PBS -l site=needed" $file; then
+            awk -i inplace 'FNR==NR{ if (/^##SBATCH/) p=NR; next} 1; FNR==p{ print "##PBS -l site=needed='${SitesNeeded}'" }' ${f} ${f}
+            awk -i inplace 'FNR==NR{ if (/^#SBATCH/) p=NR; next} 1; FNR==p{ print "#PBS -l site=needed='${SitesNeeded}'" }' ${f} ${f}
+        fi
+
+        # Replace SBATCH options with their PBS equivalents
+        sed -i -e "s/SBATCH -J /PBS -N /g" \
+            -e "s/SBATCH -N /PBS -l nodes=/g" \
+            -e "s/SBATCH -c /PBS -l ncpus=/g" \
+            -e "s/SBATCH --mem /PBS -l mem=/g" \
+            -e "s/SBATCH -t /PBS -l walltime=/g" \
+            -e "s/SBATCH -n /PBS -l nodes=1:ppn=/g" \
+            -e "s/SBATCH --ntasks-per-node/PBS -l nodes=1:ppn/g" \
+            -e "s/SBATCH -p /PBS -q /g" \
+            -e "s/SBATCH -o /PBS -o /g" \
+            -e "s/SBATCH --mail-type=END/PBS -m e/g" ${f}
+    done
+}
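+# For reference, an illustrative before/after of the sed mapping above
+# (hypothetical job header, not taken from an IMI script):
+#   #SBATCH -J inversion     ->  #PBS -N inversion
+#   #SBATCH -c 8             ->  #PBS -l ncpus=8
+#   #SBATCH --mem 10gb       ->  #PBS -l mem=10gb
+#   #SBATCH -t 24:00:00      ->  #PBS -l walltime=24:00:00
+#   #SBATCH -p debug         ->  #PBS -q debug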
--exclude-dir={"GCClassic",".git","*utilities*"})) + echo "Replacing SBATCH with PBS in the following files:" + for file in ${sbatch_files[@]}; do + f=${current_dir}${file:1} + echo " ${f}" + + # First, insert needed sites at the top of every file + if grep -q "PBS -l site=needed" $file; then + awk -i inplace 'FNR==NR{ if (/^##SBATCH/) p=NR; next} 1; FNR==p{ print "##PBS -l site=needed='${SitesNeeded}'" }' ${f} ${f} + awk -i inplace 'FNR==NR{ if (/^#SBATCH/) p=NR; next} 1; FNR==p{ print "#PBS -l site=needed='${SitesNeeded}'" }' ${f} ${f} + fi + + # Replace SBATCH options + sed -i -e "s/SBATCH -J /PBS -N /g" \ + -e "s/SBATCH -N /PBS -l nodes=/g" \ + -e "s/SBATCH -c /PBS -l ncpus=/g" \ + -e "s/SBATCH --mem /PBS -l mem=/g" \ + -e "s/SBATCH -t /PBS -l walltime=/g" \ + -e "s/SBATCH -n /PBS -l nodes=1:ppn=/g" \ + -e "s/SBATCH --ntasks-per-node/PBS -l nodes=1:ppn/g" \ + -e "s/SBATCH -p /PBS -q /g" \ + -e "s/SBATCH -o /PBS -o /g" \ + -e "s/SBATCH --mail-type=END/PBS -m e/g" ${f} + done +} + +# Description: # Print runtime stats based on existing variables # Usage: # print_stats diff --git a/src/utilities/crop_met.sh b/src/utilities/crop_met.sh index fbb4cfae..76ffc943 100755 --- a/src/utilities/crop_met.sh +++ b/src/utilities/crop_met.sh @@ -8,10 +8,9 @@ ############################################################################## # Custom to Harvard FAS RC cluster: #SBATCH -n 1 -#SBATCH -N 1 #SBATCH -t 0-6:00 #SBATCH -p huce_cascade -#SBATCH --mem=2000 +#SBATCH --mem=2gb #SBATCH --mail-type=END # Load modules for CDO diff --git a/src/utilities/sanitize_input_yaml.py b/src/utilities/sanitize_input_yaml.py index aaaf9376..a60895f2 100644 --- a/src/utilities/sanitize_input_yaml.py +++ b/src/utilities/sanitize_input_yaml.py @@ -14,21 +14,22 @@ # variables only required by local cluster config_required_local_cluster = [ - "DataPathTROPOMI", + "DataPathObs", "GEOSChemEnv", ] # variables required on all systems config_required = [ "RunName", + "Species", "isAWS", - "UseSlurm", + "SchedulerType", "SafeMode", "S3Upload", "StartDate", "EndDate", "SpinupMonths", - "BlendedTROPOMI", + "SatelliteProduct", "isRegional", "RegionID", "LonMin", @@ -71,15 +72,15 @@ "MaxSimultaneousRuns", "NumJacobianTracers", "PerturbValue", - "HourlyCH4", + "HourlySpecies", "PLANEFLIGHT", "GOSAT", "TCCON", "AIRS", + "UseBCsForRestart", "OutputPath", "DataPath", - "CondaEnv", - "CondaFile", + "PythonEnv", "RestartDownload", "RestartFilePrefix", "BCpath", @@ -145,10 +146,10 @@ def raise_error_message(var): elif config[key]: config_required = config_required + conditional_dict[key] - # update required vars based on system - if config["isAWS"]: - required_vars = config_required + config_required_aws - else: + # # update required vars based on system + # if config["isAWS"]: + # required_vars = config_required + config_required_aws + if not config["isAWS"]: required_vars = config_required + config_required_local_cluster missing_input_vars = [x for x in required_vars if x not in inputted_config] diff --git a/src/write_BCs/README.md b/src/write_BCs/README.md index b34f401f..a2ca5693 100644 --- a/src/write_BCs/README.md +++ b/src/write_BCs/README.md @@ -8,6 +8,7 @@ - `blendedDir` - where Blended TROPOMI+GOSAT files are located. - `CondaEnv` - conda environment to use for the Python script. - `GEOSChemEnv` - environment file for GEOS-Chem. + - `PythonEnv` - environment file for Python. - `Partition` - which partition to run the jobs on. - `restartFilePath` - restart file for GEOS-Chem. 
    - if your simulation starts on 1 April 2018, this won't be used (`GEOSChem.Restart.20180401_0000z.nc4` will).
diff --git a/src/write_BCs/run_boundary_conditions.sh b/src/write_BCs/run_boundary_conditions.sh
index 70f45fc9..3a6eea2a 100644
--- a/src/write_BCs/run_boundary_conditions.sh
+++ b/src/write_BCs/run_boundary_conditions.sh
@@ -1,8 +1,8 @@
 #!/bin/bash
-#SBATCH --job-name=boundary_conditions
-#SBATCH --mem=4000
-#SBATCH --time=07-00:00
-#SBATCH --output=debug.log
+#SBATCH -J boundary_conditions
+#SBATCH --mem 4gb
+#SBATCH -t 07-00:00
+#SBATCH -o debug.log
 
 cwd="$(pwd)"
 
@@ -109,17 +109,30 @@ fi
 cp runScriptSamples/operational_examples/harvard_cannon/geoschem.run .
 sed -i -e "s|sapphire,huce_cascade,seas_compute,shared|${partition}|g" \
        -e "s|--mem=15000|--mem=128000|g" \
-       -e "s|-t 0-12:00|-t 07-00:00|g" \
-       -e "s|-c 8|-c 48|g" geoschem.run
-sbatch -W geoschem.run
-wait
+       -e "s|-t 0-12:00|-t 07-00:00|g" \
+       -e "s|-c 8|-c 24|g" geoschem.run
+if [[ $SchedulerType = "slurm" ]]; then
+    sbatch -W geoschem.run; wait;
+elif [[ $SchedulerType = "PBS" ]]; then
+    qsub -Wblock=true geoschem.run; wait;
+else
+    echo "Scheduler type $SchedulerType not recognized."
+fi
 
 # Write the boundary conditions using write_boundary_conditions.py
 cd "${cwd}"
-sbatch -W -J blended -o boundary_conditions.log --open-mode=append -p ${partition} -t 7-00:00 --mem 96000 -c 40 --wrap "source $condaFile; conda activate $condaEnv; python write_boundary_conditions.py True $blendedDir $gcStartDate $gcEndDate"
-wait # run for Blended TROPOMI+GOSAT
-sbatch -W -J tropomi -o boundary_conditions.log --open-mode=append -p ${partition} -t 7-00:00 --mem 96000 -c 40 --wrap "source $condaFile; conda activate $condaEnv; python write_boundary_conditions.py False $tropomiDir $gcStartDate $gcEndDate"
-wait # run for TROPOMI data
-echo "" >>"${cwd}/boundary_conditions.log"
-echo "Blended TROPOMI+GOSAT boundary conditions --> ${workDir}/blended-boundary-conditions" >>"${cwd}/boundary_conditions.log"
-echo "TROPOMI boundary conditions --> ${workDir}/tropomi-boundary-conditions" >>"${cwd}/boundary_conditions.log"
+if [[ $SchedulerType = "slurm" || $SchedulerType = "tmux" ]]; then
+    sbatch -W -J blended -o boundary_conditions.log --open-mode=append -p ${partition} -t 7-00:00 --mem 96000 -c 40 --wrap "source ~/.bashrc; source $PythonEnv; python write_boundary_conditions.py BlendedTROPOMI $blendedDir $Species $gcStartDate $gcEndDate"
+    wait # run for Blended TROPOMI+GOSAT
+    sbatch -W -J tropomi -o boundary_conditions.log --open-mode=append -p ${partition} -t 7-00:00 --mem 96000 -c 40 --wrap "source ~/.bashrc; source $PythonEnv; python write_boundary_conditions.py TROPOMI $tropomiDir $Species $gcStartDate $gcEndDate"
+    wait # run for TROPOMI data
+elif [[ $SchedulerType = "PBS" ]]; then
+    qsub -Wblock=true -N blended -o boundary_conditions_blended.log -l select=mem=96G:ncpus=40:model=ivy,walltime=168:00:00 -- /usr/bin/bash -c "source ~/.bashrc; source $PythonEnv; python write_boundary_conditions.py BlendedTROPOMI $blendedDir $Species $gcStartDate $gcEndDate"
+    wait # run for Blended TROPOMI+GOSAT
+    qsub -Wblock=true -N tropomi -o boundary_conditions_operational.log -l select=mem=96G:ncpus=40:model=ivy,walltime=168:00:00 -- /usr/bin/bash -c "source ~/.bashrc; source $PythonEnv; python write_boundary_conditions.py TROPOMI $tropomiDir $Species $gcStartDate $gcEndDate"
+    wait # run for TROPOMI data
+fi
+
+echo "" >> "${cwd}/boundary_conditions.log"
+echo "Blended TROPOMI+GOSAT boundary conditions --> ${workDir}/blended-boundary-conditions" >> "${cwd}/boundary_conditions.log"
+echo "TROPOMI boundary conditions --> ${workDir}/tropomi-boundary-conditions" >> "${cwd}/boundary_conditions.log"
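+# Illustrative direct invocation of the Python step above (hypothetical paths):
+#   python write_boundary_conditions.py BlendedTROPOMI /path/to/blended CH4 20180401 20191231
+# where the positional arguments are: satellite product, satellite data
+# directory, species, GC output start date, and GC output end date.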
"${cwd}/boundary_conditions.log" +echo "TROPOMI boundary conditions --> ${workDir}/tropomi-boundary-conditions" >> "${cwd}/boundary_conditions.log" diff --git a/src/write_BCs/write_boundary_conditions.py b/src/write_BCs/write_boundary_conditions.py index 9fdf79ee..2589b4d7 100644 --- a/src/write_BCs/write_boundary_conditions.py +++ b/src/write_BCs/write_boundary_conditions.py @@ -16,80 +16,73 @@ sys.path.insert(0, "../../") from src.inversion_scripts.operators.operator_utilities import nearest_loc -from src.inversion_scripts.operators.TROPOMI_operator import apply_tropomi_operator -from src.inversion_scripts.utils import save_obj, load_obj +from src.inversion_scripts.operators.satellite_operator import apply_satellite_operator +from src.inversion_scripts.utils import mixing_ratio_conv_factor - -def get_TROPOMI_times(filename): +def get_satellite_times(filename): """ - Function that parses the TROPOMI filenames to get the start and end times. + Function that parses the satellite filenames to get the start and end times. Example input (str): S5P_RPRO_L2__CH4____20220725T152751_20220725T170921_24775_03_020400_20230201T100624.nc Example output (tuple): (np.datetime64('2022-07-25T15:27:51'), np.datetime64('2022-07-25T17:09:21')) """ - file_times = re.search(r"(\d{8}T\d{6})_(\d{8}T\d{6})", filename) - assert ( - file_times is not None - ), "check TROPOMI filename - wasn't able to find start and end times in the filename" - start_TROPOMI_time = np.datetime64( - datetime.datetime.strptime(file_times.group(1), "%Y%m%dT%H%M%S") - ) - end_TROPOMI_time = np.datetime64( - datetime.datetime.strptime(file_times.group(2), "%Y%m%dT%H%M%S") - ) - - return start_TROPOMI_time, end_TROPOMI_time + file_times = re.search(r'(\d{8}T\d{6})_(\d{8}T\d{6})', filename) + assert file_times is not None, "check satellite filename - wasn't able to find start and end times in the filename" + start_satellite_time = np.datetime64(datetime.datetime.strptime(file_times.group(1), "%Y%m%dT%H%M%S")) + end_satellite_time = np.datetime64(datetime.datetime.strptime(file_times.group(2), "%Y%m%dT%H%M%S")) + + return start_satellite_time, end_satellite_time - -def apply_tropomi_operator_to_one_tropomi_file(filename): +def apply_satellite_operator_to_one_satellite_file(filename, satellite_product, species): + """ - Run apply_tropomi_operator from src/inversion_scripts/operators/TROPOMI_operator.py for a single TROPOMI file + Run apply_satellite_operator from src/inversion_scripts/operators/satellite_operator.py for a single satellite file Example input (str): S5P_RPRO_L2__CH4____20220725T152751_20220725T170921_24775_03_020400_20230201T100624.nc """ - - result = apply_tropomi_operator( + + result = apply_satellite_operator( filename=filename, - BlendedTROPOMI=blendedTROPOMI, - n_elements=False, # Not relevant + species=species, + satellite_product=satellite_product, + n_elements=False, # Not relevant gc_startdate=start_time_of_interest, gc_enddate=end_time_of_interest, xlim=[-180, 180], ylim=[-90, 90], gc_cache=os.path.join(config["workDir"], "gc_run", "OutputDir"), - build_jacobian=False, # Not relevant - period_i=False, # Not relevant - config=False, - ) # Not relevant - - return result["obs_GC"], filename - + build_jacobian=False, # Not relevant + sensi_cache=False) # Not relevant + + return result["obs_GC"],filename -def create_daily_means(satelliteDir, start_time_of_interest, end_time_of_interest): +def create_daily_means(satelliteDir, satellite_product, species, start_time_of_interest, end_time_of_interest): - # List of all TROPOMI 
 
-def create_daily_means(satelliteDir, start_time_of_interest, end_time_of_interest):
+def create_daily_means(satelliteDir, satellite_product, species, start_time_of_interest, end_time_of_interest):
 
-    # List of all TROPOMI files that intersect our time period of interest
-    TROPOMI_files = sorted(
+    # List of all satellite files that intersect our time period of interest
+    satellite_files = sorted(
         [
-            file
-            for file in glob.glob(os.path.join(satelliteDir, "*.nc"))
+            file for file in glob.glob(os.path.join(satelliteDir, "*.nc"))
             if (
                 start_time_of_interest
-                <= get_TROPOMI_times(file)[0]
+                <= get_satellite_times(file)[0]
                 <= end_time_of_interest
             )
             or (
                 start_time_of_interest
-                <= get_TROPOMI_times(file)[1]
+                <= get_satellite_times(file)[1]
                 <= end_time_of_interest
            )
         ]
     )
-    print(f"First TROPOMI file -> {TROPOMI_files[0]}")
-    print(f"Last TROPOMI file -> {TROPOMI_files[-1]}")
+    print(f"First satellite file -> {satellite_files[0]}")
+    print(f"Last satellite file -> {satellite_files[-1]}")
 
-    # Using as many cores as you have, apply the TROPOMI operator to each file
+    # Using as many cores as you have, apply the satellite operator to each file
     obsGC_and_filenames = Parallel(n_jobs=-1)(
-        delayed(apply_tropomi_operator_to_one_tropomi_file)(filename)
-        for filename in TROPOMI_files
+        delayed(apply_satellite_operator_to_one_satellite_file)(
+            filename, satellite_product, species
+        )
+        for filename in satellite_files
     )
 
     # Read any of the GEOS-Chem files to get the lat/lon grid
@@ -110,60 +103,61 @@
     alldates = [day.astype(datetime.datetime).strftime("%Y%m%d") for day in alldates]
 
     # Initialize arrays for regridding
-    daily_TROPOMI = np.zeros((len(LON), len(LAT), len(alldates)))
+    daily_satellite = np.zeros((len(LON), len(LAT), len(alldates)))
     daily_GC = np.zeros((len(LON), len(LAT), len(alldates)))
     daily_count = np.zeros((len(LON), len(LAT), len(alldates)))
 
-    # Loop through all of the files which now contain TROPOMI and the corresponding GC XCH4
-    for obsGC, filename in obsGC_and_filenames:
+    # Loop through all of the files which now contain satellite data and the
+    # corresponding GC mixing ratios
+    for obsGC, filename in obsGC_and_filenames:
         NN = obsGC.shape[0]
         if NN == 0:
             continue
 
-        # For each TROPOMI observation, assign it to a GEOS-Chem grid cell
+        # For each satellite observation, assign it to a GEOS-Chem grid cell
         for iNN in range(NN):
 
            # Which day are we on (this is not perfect right now because orbits can cross from one day to the next...
-            # but it is the best we can do right now without changing apply_tropomi_operator)
-            file_times = re.search(r"(\d{8}T\d{6})_(\d{8}T\d{6})", filename)
+            # but it is the best we can do right now without changing apply_satellite_operator)
+            file_times = re.search(r'(\d{8}T\d{6})_(\d{8}T\d{6})', filename)
             assert (
                 file_times is not None
-            ), "check TROPOMI filename - wasn't able to find start and end times in the filename"
+            ), "check satellite filename - wasn't able to find start and end times in the filename"
            date = datetime.datetime.strptime(
                 file_times.group(1), "%Y%m%dT%H%M%S"
             ).strftime("%Y%m%d")
             time_ind = alldates.index(date)
 
-            c_TROPOMI, c_GC, lon0, lat0 = obsGC[iNN, :4]
+            c_satellite, c_GC, lon0, lat0 = obsGC[iNN, :4]
             ii = nearest_loc(lon0, LON, tolerance=5)
             jj = nearest_loc(lat0, LAT, tolerance=4)
-            daily_TROPOMI[ii, jj, time_ind] += c_TROPOMI
+            daily_satellite[ii, jj, time_ind] += c_satellite
             daily_GC[ii, jj, time_ind] += c_GC
             daily_count[ii, jj, time_ind] += 1
 
     # Normalize by how many observations got assigned to a grid cell to finish the regridding
     daily_count[daily_count == 0] = np.nan
-    daily_TROPOMI = daily_TROPOMI / daily_count
+    daily_satellite = daily_satellite / daily_count
     daily_GC = daily_GC / daily_count
 
     # Change dimensions
-    regrid_TROPOMI = np.einsum(
-        "ijl->lji", daily_TROPOMI
-    )  # (lon, lat, time) -> (time, lat, lon)
+    regrid_satellite = np.einsum(
+        "ijl->lji", daily_satellite
+    )  # (lon, lat, time) -> (time, lat, lon)
     regrid_GC = np.einsum("ijl->lji", daily_GC)  # (lon, lat, time) -> (time, lat, lon)
 
-    # Make a Dataset with variables of (TROPOMI_CH4, GC_CH4) and dims of (lon, lat, time)
+    # Make a Dataset with variables of (satellite, GC) and dims of (time, lat, lon)
     daily_means = xr.Dataset(
         {
-            "TROPOMI_CH4": xr.DataArray(
-                data=regrid_TROPOMI,
+            "satellite": xr.DataArray(
+                data=regrid_satellite,
                 dims=["time", "lat", "lon"],
-                coords={"time": alldates, "lat": LAT, "lon": LON},
+                coords={"time": alldates, "lat": LAT, "lon": LON}
             ),
-            "GC_CH4": xr.DataArray(
+            "GC": xr.DataArray(
                 data=regrid_GC,
                 dims=["time", "lat", "lon"],
-                coords={"time": alldates, "lat": LAT, "lon": LON},
+                coords={"time": alldates, "lat": LAT, "lon": LON}
             ),
         }
     )
@@ -173,7 +167,7 @@
 
 def calculate_bias(daily_means):
-    bias = daily_means["GC_CH4"] - daily_means["TROPOMI_CH4"]
+    bias = daily_means["GC"] - daily_means["satellite"]
 
     # Smooth spatially
     bias = bias.rolling(
@@ -218,9 +212,9 @@
     # Use these values to fill NaNs
     bias = bias.fillna(nan_value_filler_3d)
 
-    print(f"Average bias (GC-TROPOMI): {bias.mean().values:.2f} ppb\n")
+    print(f"Average bias (GC-satellite): {bias.mean().values:.2f} ppb\n")
 
-    # If there are still NaNs (this will happen when TROPOMI data is missing), use 0.0 ppb as the bias but warn the user
+    # If there are still NaNs (this will happen when satellite data is missing), use 0.0 ppb as the bias but warn the user
     for t in range(len(bias["time"].values)):
         if np.any(np.isnan(bias[t, :, :].values)):
             print(f"WARNING -> using 0.0 ppb as bias for {bias['time'].values[t]}")
@@ -228,12 +222,11 @@
 
     return bias
 
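+# Note on units (explanatory, not part of the IMI source): `bias` is in the
+# species' conventional mixing-ratio units (ppb for CH4), so dividing by
+# mixing_ratio_conv_factor(species) below returns mol/mol, e.g. for CH4:
+#   20.0 / mixing_ratio_conv_factor("CH4")  # -> 2.0e-8 mol/mol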
-
-def write_bias_corrected_files(bias):
+def write_bias_corrected_files(bias, species, satellite_product):
 
     # Get dates and convert the total column bias to mol/mol
     strdate = bias["time"].values
-    bias_mol_mol = bias.values * 1e-9
+    bias_mol_mol = bias.values / mixing_ratio_conv_factor(species)
 
     # Only write BCs for our date range
     files = sorted(
@@ -275,51 +268,53 @@
         bias_for_this_boundary_condition_file = bias_mol_mol[index, :, :]
 
         with xr.open_dataset(filename) as ds:
-            original_data = ds["SpeciesBC_CH4"].values.copy()
+            original_data = ds[f"SpeciesBC_{species}"].values.copy()
             for t in range(original_data.shape[0]):
                 for lev in range(original_data.shape[1]):
                     original_data[t, lev, :, :] -= bias_for_this_boundary_condition_file
-            ds["SpeciesBC_CH4"].values = original_data
-            if blendedTROPOMI:
+            ds[f"SpeciesBC_{species}"].values = original_data
+            if satellite_product == "BlendedTROPOMI":
                 print(
                     f"Writing to {os.path.join(config['workDir'], 'blended-boundary-conditions', os.path.basename(filename))}"
                 )
                 ds.to_netcdf(
                     os.path.join(
                         config["workDir"],
                         "blended-boundary-conditions",
                         os.path.basename(filename),
                     )
                 )
-            else:
+            elif satellite_product == "TROPOMI":
                 print(
                     f"Writing to {os.path.join(config['workDir'], 'tropomi-boundary-conditions', os.path.basename(filename))}"
                 )
                 ds.to_netcdf(
                     os.path.join(
                         config["workDir"],
                         "tropomi-boundary-conditions",
                         os.path.basename(filename),
                     )
                 )
+            else:
+                print("Other data sources for boundary conditions are not currently supported")
+
 
 if __name__ == "__main__":
 
     # Arguments from run_boundary_conditions.sh
-    blendedTROPOMI = sys.argv[1] == "True"  # use blended data?
-    satelliteDir = sys.argv[2]  # where is the satellite data?
+    satellite_product = sys.argv[1]  # which satellite product to use
+    satelliteDir = sys.argv[2]  # where the satellite data are located
+    species = sys.argv[3]  # which species to use
 
     # Start of GC output (+1 day except 1 Apr 2018 because we ran 1 day extra at the start to account for data not being written at t=0)
     start_time_of_interest = np.datetime64(
-        datetime.datetime.strptime(sys.argv[3], "%Y%m%d")
+        datetime.datetime.strptime(sys.argv[4], "%Y%m%d")
     )
     if start_time_of_interest != np.datetime64("2018-04-01T00:00:00"):
         start_time_of_interest += np.timedelta64(1, "D")
 
     # End of GC output
     end_time_of_interest = np.datetime64(
-        datetime.datetime.strptime(sys.argv[4], "%Y%m%d")
+        datetime.datetime.strptime(sys.argv[5], "%Y%m%d")
     )
 
-    print(f"\nwrite_boundary_conditions.py output for blendedTROPOMI={blendedTROPOMI}")
+    print(f"\nwrite_boundary_conditions.py output for {satellite_product}")
     print(f"Using files at {satelliteDir}")
 
     """
@@ -339,7 +334,8 @@
     """
 
     daily_means = create_daily_means(
-        satelliteDir, start_time_of_interest, end_time_of_interest
+        satelliteDir, satellite_product, species,
+        start_time_of_interest, end_time_of_interest
     )
     bias = calculate_bias(daily_means)
-    write_bias_corrected_files(bias)
+    write_bias_corrected_files(bias, species, satellite_product)