Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/aws-forecast-only' into azure-fo…
Browse files Browse the repository at this point in the history
…recast-only
  • Loading branch information
weihuang-jedi committed Jul 12, 2024
2 parents 3288d9c + 32f13eb commit 5a96c83
Show file tree
Hide file tree
Showing 54 changed files with 385 additions and 599 deletions.
95 changes: 58 additions & 37 deletions ci/Jenkinsfile
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
def Machine = 'none'
def machine = 'none'
def HOME = 'none'
def CUSTOM_WORKSPACE = 'none'
def caseList = ''
// Location of the custom workspaces for each machine in the CI system. They are persistent for each iteration of the PR.
def custom_workspace = [hera: '/scratch1/NCEPDEV/global/CI', orion: '/work2/noaa/stmp/CI/ORION', hercules: '/work2/noaa/stmp/CI/HERCULES']
def NodeName = [hera: 'Hera-EMC', orion: 'Orion-EMC', hercules: 'Hercules-EMC', gaea: 'Gaea']
def custom_workspace = [hera: '/scratch1/NCEPDEV/global/CI', orion: '/work2/noaa/stmp/CI/ORION', hercules: '/work2/noaa/stmp/CI/HERCULES', gaea: '/gpfs/f5/epic/proj-shared/global/CI']
def repo_url = '[email protected]:NOAA-EMC/global-workflow.git'
def STATUS = 'Passed'

Expand Down Expand Up @@ -40,9 +41,9 @@ pipeline {
echo "This is parent job so getting list of nodes matching labels:"
for (label in pullRequest.labels) {
if (label.matches("CI-(.*?)-Ready")) {
def Machine_name = label.split('-')[1].toString()
def machine_name = label.split('-')[1].toString().toLowerCase()
jenkins.model.Jenkins.get().computers.each { c ->
if (c.node.selfLabel.name == "${Machine_name}-EMC") {
if (c.node.selfLabel.name == NodeName[machine_name]) {
run_nodes.add(c.node.selfLabel.name)
}
}
Expand Down Expand Up @@ -70,25 +71,25 @@ pipeline {
}

stage('2. Get Common Workspace') {
agent { label "${machine}-emc" }
agent { label NodeName[machine].toLowerCase() }
steps {
script {
Machine = machine[0].toUpperCase() + machine.substring(1)
echo "Getting Common Workspace for ${Machine}"
ws("${custom_workspace[machine]}/${env.CHANGE_ID}") {
properties([parameters([[$class: 'NodeParameterDefinition', allowedSlaves: ['built-in', 'Hercules-EMC', 'Hera-EMC', 'Orion-EMC'], defaultSlaves: ['built-in'], name: '', nodeEligibility: [$class: 'AllNodeEligibility'], triggerIfResult: 'allCases']])])
HOME = "${WORKSPACE}"
sh(script: "mkdir -p ${HOME}/RUNTESTS;rm -Rf ${HOME}/RUNTESTS/*")
CUSTOM_WORKSPACE = "${WORKSPACE}"
sh(script: "mkdir -p ${CUSTOM_WORKSPACE}/RUNTESTS;rm -Rf ${CUSTOM_WORKSPACE}/RUNTESTS/*")
sh(script: """${GH} pr edit ${env.CHANGE_ID} --repo ${repo_url} --add-label "CI-${Machine}-Building" --remove-label "CI-${Machine}-Ready" """)
}
echo "Building and running on ${Machine} in directory ${HOME}"
echo "Building and running on ${Machine} in directory ${CUSTOM_WORKSPACE}"
}
}
}

stage('3. Build System') {
matrix {
agent { label "${machine}-emc" }
agent { label NodeName[machine].toLowerCase() }
//options {
// throttle(['global_matrix_build'])
//}
Expand All @@ -102,7 +103,7 @@ pipeline {
stage('build system') {
steps {
script {
def HOMEgfs = "${HOME}/${system}" // local HOMEgfs is used to build the system on per system basis under the common workspace HOME
def HOMEgfs = "${CUSTOM_WORKSPACE}/${system}" // local HOMEgfs is used to build the system on a per-system basis under the custom workspace for each build system
sh(script: "mkdir -p ${HOMEgfs}")
ws(HOMEgfs) {
if (fileExists("${HOMEgfs}/sorc/BUILT_semaphor")) { // if the system is already built, skip the build in the case of re-runs
Expand All @@ -112,7 +113,16 @@ pipeline {
sh(script: './link_workflow.sh')
}
} else {
checkout scm
try {
echo "Checking out the code for ${system} on ${Machine} using scm in ${HOMEgfs}"
checkout scm
} catch (Exception e) {
if (env.CHANGE_ID) {
sh(script: """${GH} pr comment ${env.CHANGE_ID} --repo ${repo_url} --body "Checkout **Failed** on ${Machine}: ${e.getMessage()}" """)
}
echo "Failed to checkout: ${e.getMessage()}"
STATUS = 'Failed'
}
def gist_url = ""
def error_logs = ""
def error_logs_message = ""
Expand Down Expand Up @@ -173,7 +183,7 @@ pipeline {
stage('4. Run Tests') {
failFast false
matrix {
agent { label "${machine}-emc" }
agent { label NodeName[machine].toLowerCase() }
axes {
axis {
name 'Case'
Expand All @@ -189,11 +199,11 @@ pipeline {
}
steps {
script {
sh(script: "sed -n '/{.*}/!p' ${HOME}/gfs/ci/cases/pr/${Case}.yaml > ${HOME}/gfs/ci/cases/pr/${Case}.yaml.tmp")
def yaml_case = readYaml file: "${HOME}/gfs/ci/cases/pr/${Case}.yaml.tmp"
sh(script: "sed -n '/{.*}/!p' ${CUSTOM_WORKSPACE}/gfs/ci/cases/pr/${Case}.yaml > ${CUSTOM_WORKSPACE}/gfs/ci/cases/pr/${Case}.yaml.tmp")
def yaml_case = readYaml file: "${CUSTOM_WORKSPACE}/gfs/ci/cases/pr/${Case}.yaml.tmp"
system = yaml_case.experiment.system
def HOMEgfs = "${HOME}/${system}" // local HOMEgfs is used to populate the XML on per system basis
env.RUNTESTS = "${HOME}/RUNTESTS"
def HOMEgfs = "${CUSTOM_WORKSPACE}/${system}" // local HOMEgfs is used to populate the XML on per system basis
env.RUNTESTS = "${CUSTOM_WORKSPACE}/RUNTESTS"
sh(script: "${HOMEgfs}/ci/scripts/utils/ci_utils_wrapper.sh create_experiment ${HOMEgfs}/ci/cases/pr/${Case}.yaml")
}
}
Expand All @@ -206,27 +216,27 @@ pipeline {
failFast false
steps {
script {
HOMEgfs = "${HOME}/gfs" // common HOMEgfs is used to launch the scripts that run the experiments
def pslot = sh(script: "${HOMEgfs}/ci/scripts/utils/ci_utils_wrapper.sh get_pslot ${HOME}/RUNTESTS ${Case}", returnStdout: true).trim()
def error_file = "${HOME}/RUNTESTS/${pslot}_error.logs"
HOMEgfs = "${CUSTOM_WORKSPACE}/gfs" // common HOMEgfs is used to launch the scripts that run the experiments
def pslot = sh(script: "${HOMEgfs}/ci/scripts/utils/ci_utils_wrapper.sh get_pslot ${CUSTOM_WORKSPACE}/RUNTESTS ${Case}", returnStdout: true).trim()
def error_file = "${CUSTOM_WORKSPACE}/RUNTESTS/${pslot}_error.logs"
sh(script: " rm -f ${error_file}")
try {
sh(script: "${HOMEgfs}/ci/scripts/run-check_ci.sh ${HOME} ${pslot} ${system}")
sh(script: "${HOMEgfs}/ci/scripts/run-check_ci.sh ${CUSTOM_WORKSPACE} ${pslot} ${system}")
} catch (Exception error_experment) {
sh(script: "${HOMEgfs}/ci/scripts/utils/ci_utils_wrapper.sh cancel_batch_jobs ${pslot}")
ws(HOME) {
ws(CUSTOM_WORKSPACE) {
def error_logs = ""
def error_logs_message = ""
if (fileExists(error_file)) {
def fileContent = readFile error_file
def lines = fileContent.readLines()
for (line in lines) {
echo "archiving: ${line}"
if (fileExists("${HOME}/${line}") && readFile("${HOME}/${line}").length() > 0) {
if (fileExists("${CUSTOM_WORKSPACE}/${line}") && readFile("${CUSTOM_WORKSPACE}/${line}").length() > 0) {
try {
archiveArtifacts artifacts: "${line}", fingerprint: true
error_logs = error_logs + "${HOME}/${line} "
error_logs_message = error_logs_message + "${HOME}/${line}\n"
error_logs = error_logs + "${CUSTOM_WORKSPACE}/${line} "
error_logs_message = error_logs_message + "${CUSTOM_WORKSPACE}/${line}\n"
} catch (Exception error_arch) {
echo "Failed to archive error log ${line}: ${error_arch.getMessage()}"
}
Expand All @@ -240,12 +250,12 @@ pipeline {
echo "Failed to comment on PR: ${error_comment.getMessage()}"
}
} else {
echo "No error logs found for failed cases in $HOME/RUNTESTS/${pslot}_error.logs"
echo "No error logs found for failed cases in $CUSTOM_WORKSPACE/RUNTESTS/${pslot}_error.logs"
}
STATUS = 'Failed'
try {
sh(script: """${GH} pr edit ${env.CHANGE_ID} --repo ${repo_url} --remove-label "CI-${Machine}-Running" --add-label "CI-${Machine}-${STATUS}" """, returnStatus: true)
sh(script: """${GH} pr comment ${env.CHANGE_ID} --repo ${repo_url} --body "Experiment ${Case} **FAILED** on ${Machine} in\n\\`${HOME}/RUNTESTS/${pslot}\\`" """)
sh(script: """${GH} pr comment ${env.CHANGE_ID} --repo ${repo_url} --body "Experiment ${Case} **FAILED** on ${Machine} in\n\\`${CUSTOM_WORKSPACE}/RUNTESTS/${pslot}\\`" """)
} catch (Exception e) {
echo "Failed to update label from Running to ${STATUS}: ${e.getMessage()}"
}
Expand All @@ -259,19 +269,30 @@ pipeline {
}
}
stage( '5. FINALIZE' ) {
when {
expression {
STATUS == 'Passed'
}
}
agent { label "${machine}-emc" }
agent { label NodeName[machine].toLowerCase() }
steps {
script {
try {
sh(script: """${GH} pr edit ${env.CHANGE_ID} --repo ${repo_url} --remove-label "CI-${Machine}-Running" --remove-label "CI-${Machine}-Building" --add-label "CI-${Machine}-${STATUS}" """, returnStatus: true)
sh(script: """${GH} pr comment ${env.CHANGE_ID} --repo ${repo_url} --body "**CI ${STATUS}** ${Machine} at <br>Built and ran in directory \\`${HOME}\\`" """, returnStatus: true)
} catch (Exception e) {
echo "Failed to update label from Running to ${STATUS}: ${e.getMessage()}"
sh(script: """
labels=\$(gh pr view ${env.CHANGE_ID} --repo ${repo_url} --json labels --jq '.labels[].name')
for label in \$labels; do
if [[ "\$label" == *"${Machine}"* ]]; then
gh pr edit ${env.CHANGE_ID} --repo ${repo_url} --remove-label "\$label"
fi
done
""", returnStatus: true)
sh(script: """${GH} pr edit ${env.CHANGE_ID} --repo ${repo_url} --add-label "CI-${Machine}-${STATUS}" """, returnStatus: true)
if (fileExists("${CUSTOM_WORKSPACE}/RUNTESTS/ci-run_check.log")) {
sh(script: """echo "**CI ${STATUS}** ${Machine} at <br>Built and ran in directory \\`${CUSTOM_WORKSPACE}\\`\n\\`\\`\\`\n" | cat - ${CUSTOM_WORKSPACE}/RUNTESTS/ci-run_check.log > temp && mv temp ${CUSTOM_WORKSPACE}/RUNTESTS/ci-run_check.log""", returnStatus: true)
sh(script: """${GH} pr comment ${env.CHANGE_ID} --repo ${repo_url} --body-file ${CUSTOM_WORKSPACE}/RUNTESTS/ci-run_check.log """, returnStatus: true)
}
if (STATUS == 'Passed') {
try {
sh(script: "rm -Rf ${CUSTOM_WORKSPACE}/*")
} catch (Exception e) {
echo "Failed to remove custom work directory ${CUSTOM_WORKSPACE} on ${Machine}: ${e.getMessage()}"
}
} else {
echo "Failed to build and run Global-workflow in ${CUSTOM_WORKSPACE} and on ${Machine}"
}
}
}
Expand Down
1 change: 0 additions & 1 deletion ci/cases/pr/C96_atmaerosnowDA.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,3 @@ arguments:
skip_ci_on_hosts:
- orion
- hercules
- wcoss2
2 changes: 1 addition & 1 deletion ci/scripts/check_ci.sh
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ for pr in ${pr_list}; do
DATE=$(date +'%D %r')
echo "Experiment ${pslot} **SUCCESS** on ${MACHINE_ID^} at ${DATE}" >> "${output_ci_single}"
echo "Experiment ${pslot} *** SUCCESS *** at ${DATE}" >> "${output_ci}"
"${GH}" pr comment "${pr}" --repo "${REPO_URL}" --body-file "${output_ci_single}"
# "${GH}" pr comment "${pr}" --repo "${REPO_URL}" --body-file "${output_ci_single}"
fi
done
done
4 changes: 1 addition & 3 deletions ci/scripts/run-check_ci.sh
Original file line number Diff line number Diff line change
Expand Up @@ -101,9 +101,7 @@ while true; do

if [[ "${ROCOTO_STATE}" == "DONE" ]]; then
{
echo "Experiment ${pslot} Completed ${CYCLES_DONE} Cycles at $(date)" || true
echo "with ${SUCCEEDED} successfully completed jobs" || true
echo "Experiment ${pslot} Completed: *SUCCESS*"
echo "Experiment ${pslot} Completed ${CYCLES_DONE} Cycles: *SUCCESS* at $(date)" || true
} | tee -a "${run_check_logfile}"
rc=0
break
Expand Down
40 changes: 34 additions & 6 deletions env/GAEA.env
Original file line number Diff line number Diff line change
Expand Up @@ -12,24 +12,52 @@ step=$1
export launcher="srun -l --export=ALL"
export mpmd_opt="--multi-prog --output=mpmd.%j.%t.out"

export OMP_STACKSIZE=2048000
export NTHSTACK=1024000000

ulimit -s unlimited
ulimit -a

if [[ "${step}" = "fcst" ]]; then
if [[ "${step}" = "waveinit" ]]; then

export CFP_MP="YES"
if [[ "${step}" = "waveprep" ]]; then export MP_PULSE=0 ; fi
export wavempexec=${launcher}
export wave_mpmd=${mpmd_opt}

ppn="npe_node_${step}_${RUN}"
[[ -z "${!ppn+0}" ]] && ppn="npe_node_${step}"
nprocs="npe_${step}_${RUN}"
[[ -z ${!nprocs+0} ]] && nprocs="npe_${step}"
elif [[ "${step}" = "fcst" ]]; then

if [[ "${CDUMP}" =~ "gfs" ]]; then
nprocs="npe_${step}_gfs"
ppn="npe_node_${step}_gfs" || ppn="npe_node_${step}"
else
nprocs="npe_${step}"
ppn="npe_node_${step}"
fi
(( nnodes = (${!nprocs}+${!ppn}-1)/${!ppn} ))
(( ntasks = nnodes*${!ppn} ))
# With ESMF threading, the model wants to use the full node
export APRUN_UFS="${launcher} -n ${ntasks}"
unset nprocs ppn nnodes ntasks


elif [[ "${step}" = "atmos_products" ]]; then

export USE_CFP="YES" # Use MPMD for downstream product generation
export USE_CFP="YES" # Use MPMD for downstream product generation on Gaea

elif [[ "${step}" = "oceanice_products" ]]; then

nth_max=$((npe_node_max / npe_node_oceanice_products))

export NTHREADS_OCNICEPOST=${nth_oceanice_products:-1}
export APRUN_OCNICEPOST="${launcher} -n 1 --cpus-per-task=${NTHREADS_OCNICEPOST}"

elif [[ "${step}" = "fit2obs" ]]; then

nth_max=$((npe_node_max / npe_node_fit2obs))

export NTHREADS_FIT2OBS=${nth_fit2obs:-1}
[[ ${NTHREADS_FIT2OBS} -gt ${nth_max} ]] && export NTHREADS_FIT2OBS=${nth_max}
export MPIRUN="${launcher} -n ${npe_fit2obs} --cpus-per-task=${NTHREADS_FIT2OBS}"

fi
10 changes: 10 additions & 0 deletions env/HERCULES.env
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,16 @@ case ${step} in
[[ ${NTHREADS_OCNANAL} -gt ${nth_max} ]] && export NTHREADS_OCNANAL=${nth_max}
export APRUN_OCNANAL="${launcher} -n ${npe_ocnanalrun} --cpus-per-task=${NTHREADS_OCNANAL}"
;;
# Settings exported when ${step} is "ocnanalecen": an MPMD launch command, a thread count, and the step's launch command.
"ocnanalecen")

# \$ncmd is escaped, so APRUNCFP holds the literal text "$ncmd"; it is not expanded here.
# NOTE(review): presumably the consumer of APRUNCFP substitutes ncmd at run time -- confirm.
export APRUNCFP="${launcher} -n \$ncmd ${mpmd_opt}"

# Thread ceiling: integer division of npe_node_max by npe_node_ocnanalecen.
# NOTE(review): assumes both variables are set and non-zero before this file is sourced -- confirm.
nth_max=$((npe_node_max / npe_node_ocnanalecen))

# Use nth_ocnanalecen when it is set and non-empty, otherwise fall back to nth_max;
# then clamp the result so it never exceeds nth_max.
export NTHREADS_OCNANALECEN=${nth_ocnanalecen:-${nth_max}}
[[ ${NTHREADS_OCNANALECEN} -gt ${nth_max} ]] && export NTHREADS_OCNANALECEN=${nth_max}
# Launch command for this step: -n npe_ocnanalecen with --cpus-per-task set to the clamped thread count.
export APRUN_OCNANALECEN="${launcher} -n ${npe_ocnanalecen} --cpus-per-task=${NTHREADS_OCNANALECEN}"
;;
"ocnanalchkpt")

export APRUNCFP="${launcher} -n \$ncmd ${mpmd_opt}"
Expand Down
5 changes: 0 additions & 5 deletions jobs/rocoto/aeroanlfinal.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,6 @@ status=$?
export job="aeroanlfinal"
export jobid="${job}.$$"

###############################################################
# setup python path for workflow utilities and tasks
wxflowPATH="${HOMEgfs}/ush/python:${HOMEgfs}/ush/python/wxflow/src"
PYTHONPATH="${PYTHONPATH:+${PYTHONPATH}:}${wxflowPATH}"
export PYTHONPATH
###############################################################
# Execute the JJOB
"${HOMEgfs}/jobs/JGLOBAL_AERO_ANALYSIS_FINALIZE"
Expand Down
6 changes: 0 additions & 6 deletions jobs/rocoto/aeroanlinit.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,6 @@ status=$?
export job="aeroanlinit"
export jobid="${job}.$$"

###############################################################
# setup python path for workflow utilities and tasks
wxflowPATH="${HOMEgfs}/ush/python:${HOMEgfs}/ush/python/wxflow/src"
PYTHONPATH="${PYTHONPATH:+${PYTHONPATH}:}${wxflowPATH}"
export PYTHONPATH

###############################################################
# Execute the JJOB
"${HOMEgfs}/jobs/JGLOBAL_AERO_ANALYSIS_INITIALIZE"
Expand Down
6 changes: 0 additions & 6 deletions jobs/rocoto/aeroanlrun.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,6 @@ status=$?
export job="aeroanlrun"
export jobid="${job}.$$"

###############################################################
# setup python path for workflow utilities and tasks
wxflowPATH="${HOMEgfs}/ush/python:${HOMEgfs}/ush/python/wxflow/src"
PYTHONPATH="${PYTHONPATH:+${PYTHONPATH}:}${wxflowPATH}"
export PYTHONPATH

###############################################################
# Execute the JJOB
"${HOMEgfs}/jobs/JGLOBAL_AERO_ANALYSIS_RUN"
Expand Down
5 changes: 0 additions & 5 deletions jobs/rocoto/atmanlfinal.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,6 @@ status=$?
export job="atmanlfinal"
export jobid="${job}.$$"

###############################################################
# setup python path for workflow utilities and tasks
wxflowPATH="${HOMEgfs}/ush/python:${HOMEgfs}/ush/python/wxflow/src"
PYTHONPATH="${PYTHONPATH:+${PYTHONPATH}:}${wxflowPATH}"
export PYTHONPATH
###############################################################
# Execute the JJOB
"${HOMEgfs}/jobs/JGLOBAL_ATM_ANALYSIS_FINALIZE"
Expand Down
Loading

0 comments on commit 5a96c83

Please sign in to comment.