# Source code for psij.executors.batch.batch_scheduler_executor

import logging
import os
import subprocess
import time
import traceback
from abc import abstractmethod
from datetime import timedelta
from pathlib import Path
from threading import Thread, RLock
from typing import Optional, List, Dict, Collection, cast, Union, IO

from psij.launchers.script_based_launcher import ScriptBasedLauncher

from psij import JobExecutor, JobExecutorConfig, Launcher, Job, SubmitException, \
    JobStatus, JobState
from psij.executors.batch.template_function_library import ALL as FUNCTION_LIBRARY

# Fallback error message used when a failed command produced no output at all.
UNKNOWN_ERROR = 'PSIJ: Unknown error'

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


def check_status_exit_code(command: str, exit_code: int, out: str) -> None:
    """Check if exit_code is nonzero and, if so, raise a RuntimeError.

    This function produces a somewhat user-friendly exception message that combines
    the command that was run with its output.

    Parameters
    ----------
    command
        The command that was run. This is only used to format the error message.
    exit_code
        The exit code returned by running the ``command``.
    out
        The output produced by ``command``.

    Raises
    ------
    RuntimeError
        If ``exit_code`` is not zero.
    """
    if exit_code != 0:
        raise RuntimeError(f'status command {command!r} exited '
                           f'with {exit_code} and output {out!r}')


def _attrs_to_mustache(job: Job) -> Dict[str, Union[object, List[Dict[str, object]]]]:
    """Convert the custom attributes of a job into a template-friendly dictionary.

    Attribute names of the form ``<prefix>.<rest>`` are grouped under the lower-cased
    ``<prefix>`` as a list of ``{'key': <rest>, 'value': <value>}`` entries. Names without a
    dot are copied unchanged, mapping directly to their value.

    Parameters
    ----------
    job
        The job whose custom attributes are converted. Its spec must be set.

    Returns
    -------
    The converted dictionary, which is empty if the job has no custom attributes.
    """
    assert job.spec is not None
    attributes = job.spec.attributes
    if not attributes or not attributes._custom_attributes:
        return {}

    rendered: Dict[str, Union[object, List[Dict[str, object]]]] = {}

    for name, value in attributes._custom_attributes.items():
        prefix, dot, remainder = name.partition('.')
        if not dot:
            # no prefix; store the attribute as is
            rendered[name] = value
            continue
        # always use lower case here
        group = rendered.setdefault(prefix.lower(), [])
        cast(List[Dict[str, object]], group).append({'key': remainder, 'value': value})
    return rendered


def _env_to_mustache(job: Job) -> List[Dict[str, str]]:
    """Convert the environment of a job into a list of name/value entries.

    Parameters
    ----------
    job
        The job whose environment is converted. Its spec must be set.

    Returns
    -------
    A list with one ``{'name': <name>, 'value': <value>}`` dictionary for each environment
    variable in the job spec, or an empty list if the job has no environment.
    """
    assert job.spec is not None
    environment = job.spec.environment
    if not environment:
        return []

    return [{'name': name, 'value': value} for name, value in environment.items()]


class BatchSchedulerExecutorConfig(JobExecutorConfig):
    """A base configuration class for :class:`~BatchSchedulerExecutor` implementations.

    When subclassing :class:`~BatchSchedulerExecutor`, specific configuration classes inheriting
    from this class should be defined, even if empty.
    """

    def __init__(self, launcher_log_file: Optional[Path] = None,
                 work_directory: Optional[Path] = None, queue_polling_interval: int = 30,
                 initial_queue_polling_delay: int = 2,
                 queue_polling_error_threshold: int = 2,
                 keep_files: bool = False):
        """
        Parameters
        ----------
        launcher_log_file
            See :class:`~psij.JobExecutorConfig`.
        work_directory
            See :class:`~psij.JobExecutorConfig`.
        queue_polling_interval
            an interval, in seconds, at which the batch scheduler queue will be polled for updates
            to jobs.
        initial_queue_polling_delay
            the time to wait before polling the queue for the first time; for quick tests that only
            submit a short job that completes nearly instantly or for jobs that fail very quickly,
            this can dramatically reduce the time taken to get the necessary job status update.
        queue_polling_error_threshold
            The number of times consecutive queue polls have to fail in order for the executor to
            report them as job failures.
        keep_files
            Whether to keep submit files and auxiliary job files (exit code and output files) after
            a job has completed. This is forced to `True` if the ``PSIJ_BATCH_KEEP_FILES``
            environment variable is set, regardless of its value.
        """
        # Pass by keyword so that each value reaches the base class parameter of the same name
        # independently of the positional order in the base class signature.
        super().__init__(launcher_log_file=launcher_log_file, work_directory=work_directory)
        self.queue_polling_interval = queue_polling_interval
        self.initial_queue_polling_delay = initial_queue_polling_delay
        self.queue_polling_error_threshold = queue_polling_error_threshold
        self.keep_files = keep_files
        if 'PSIJ_BATCH_KEEP_FILES' in os.environ:
            self.keep_files = True

class InvalidJobStateError(Exception):
    """An exception that signals that a job cannot be cancelled due to it being already done."""

    pass


class BatchSchedulerExecutor(JobExecutor):
    """A base class for batch scheduler executors.

    This class implements a generic :class:`~psij.JobExecutor` that interacts with batch schedulers.
    There are two main components to the executor: job submission and queue polling. Submission
    is implemented by generating a submit script which is then fed to the queuing system submit
    command.

    The submit script is generated using a :func:`~generate_submit_script`. An implementation of
    this functionality based on Mustache/Pystache (see https://mustache.github.io/ and
    https://pypi.org/project/pystache/) exists in :class:`~.TemplatedScriptGenerator`. This class
    can be instantiated by concrete implementations of a batch scheduler executor and the submit
    script generation can be delegated to that instance, which has a method whose signature matches
    that of :func:`~generate_submit_script`. Besides an opened file which points to where the
    contents of the submit script are to be written, the parameters to
    :func:`~generate_submit_script` are the :class:`~psij.Job` that is being submitted and a
    `context`, which is a dictionary with the following structure::

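        {
            'job': <the job being submitted>,
            'custom_attributes': <dict; custom attributes of the job>,
            'env': <list; name/value pairs of the job environment>,
            'formatted_job_duration': <str; only present if the job has a duration>,
            'psij': {
                'lib': <dict; function library>,
                'launch_command': <list of str; launch command>,
                'script_dir': <str; directory where the submit script is generated>
            }
        }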

    The *script directory* is a directory (typically `~/.psij/work`) where submit scripts are
    written; it is also used for auxiliary files, such as the *exit code file* (see below) or the
    *script output file*.

    The *launch command* is a list of strings which the script generator should render as the
    command to execute. It wraps the job executable in the proper :class:`~psij.Launcher`.

    The function library is a dictionary mapping function names to functions for all public
    functions in the :mod:`~.template_function_library` module.

    The submit script *must* perform two essential actions:

        1. redirect the output of the executable part of the script to the *script output file*,
        which is a file in `<script_dir>` named `<native_id>.out`, where `<native_id>` is the id
        given to the job by the queuing system.

        2. store the exit code of the launch command in the *exit code file* named
        `<native_id>.ec`, also inside `<script_dir>`.

    Additionally, where appropriate, the submit script should set the environment variable named
    ``PSIJ_NODEFILE`` to point to a file containing a list of nodes that are allocated for the job,
    one per line, with a total number of lines matching the process count of the job.

    Once the submit script is generated, the executor renders the submit command using
    :func:`~get_submit_command` and executes it. Its output is then parsed using
    :func:`~job_id_from_submit_output` to retrieve the `native_id` of the job. Subsequently, the
    job is registered with the queue polling thread.

    The queue polling thread regularly polls the batch scheduler queue for updates to job states.
    It builds the command for polling the queue using :func:`~get_status_command`, which takes a
    list of `native_id` strings corresponding to all registered jobs. Implementations are
    strongly encouraged to restrict the query of job states to the specified jobs in order to reduce
    the load on the queuing system. The output of the status command is then parsed using
    :func:`~parse_status_output` and the status of each job is updated accordingly. If the status
    of a registered job is not found in the output of the queue status command, it is assumed
    completed (or failed, depending on its exit code), since most queuing systems automatically
    purge completed jobs from their databases after a short period of time. The exit code is read
    from the exit code file, as described above. If the exit code value is not zero, the job is
    assumed failed and an attempt is made to read an error message from the *script output file*.
    """

    def __init__(self, url: Optional[str] = None,
                 config: Optional[BatchSchedulerExecutorConfig] = None):
        """
        Parameters
        ----------
        url
            An optional URL pointing to a specific backend
        config
            A configuration for this executor instance; if none is specified, a default
            configuration is used.
        """
        # Materialize the default before using it; the executor needs the actual configuration
        # object below, not just the value handed to the base class.
        if config is None:
            config = BatchSchedulerExecutorConfig()
        super().__init__(url=url, config=config)
        self.work_directory = config.work_directory / self.name
        self._queue_poll_thread = self._start_queue_poll_thread()

    def _ensure_work_dir(self) -> None:
        self.work_directory.mkdir(parents=True, exist_ok=True)

[docs]    def submit(self, job: Job) -> None:
        """See :func:`~psij.JobExecutor.submit`."""
        logger.info('Job %s: submitting', job.id)
        self._ensure_work_dir()

        self._check_job(job)

        context = self._create_script_context(job)

        # assumes job ids are unique
        submit_file_path = self.work_directory / (job.id + '.job')
        with submit_file_path.open('w') as submit_file:
            self.generate_submit_script(job, context, submit_file)
        try:
            logger.debug('Job %s: running submit command', job.id)
            out = self._run_command(self.get_submit_command(job, submit_file_path))
            logger.debug('Job %s: submit command output: %s', job.id, out)
            job._native_id = self.job_id_from_submit_output(out)
            logger.info('Job %s: native id: %s', job.id, job.native_id)
            self._set_job_status(job, JobStatus(JobState.QUEUED,
                                                metadata={'native_id': job.native_id}))
        except subprocess.CalledProcessError as ex:
            if logger.isEnabledFor(logging.DEBUG):
                with submit_file_path.open('r') as submit_file:
                    script = submit_file.read()
                logger.debug('Job %s: submit script is: %s' % (job.id, script))

            raise SubmitException(ex.output) from None

        self._queue_poll_thread.register_job(job)

    def _get_launcher_from_job(self, job: Job) -> Launcher:
        """Return the launcher named in the job spec, or the default one if none is named."""
        assert job.spec
        launcher_name = job.spec.launcher or Launcher.DEFAULT_LAUNCHER_NAME
        return self._get_launcher(launcher_name)

[docs]    def cancel(self, job: Job) -> None:
        """Cancels a job if it has not otherwise completed.

        A command is constructed using :func:`~get_cancel_command` and executed in order to cancel
        the job. Also see :func:`~psij.JobExecutor.cancel`.
        """
        if job.native_id is None:
            raise SubmitException('Job does not have a native ID.')
        if job.status.state.final:
            return
        try:
            self._run_command(self.get_cancel_command(job.native_id))
        except subprocess.CalledProcessError as ex:
            try:
                self.process_cancel_command_output(ex.returncode, ex.output)
            except InvalidJobStateError:
                # do nothing; the job has completed anyway
                pass
            except SubmitException:
                # re-raise
                raise

[docs]    def attach(self, job: Job, native_id: str) -> None:
        """Attaches a job to a native job.

        Attempts to connect `job` to a native job with `native_id` such that the job correctly
        reflects updates to the status of the native job. If the native job was previously
        submitted using this executor (hence having an *exit code file* and a *script output file*),
        the executor will attempt to retrieve the exit code and errors from the job. Otherwise, it
        may be impossible for the executor to distinguish between a failed and successfully
        completed job.

        Parameters
        ----------
        job
            The PSI/J job to attach.
        native_id
            The id of the batch scheduler job to attach to.
        """
        job._native_id = native_id
        job.executor = self
        self._queue_poll_thread.register_job(job)

[docs]    @abstractmethod
    def generate_submit_script(self, job: Job, context: Dict[str, object],
                               submit_file: IO[str]) -> None:
        """Called to generate a submit script for a job.

        Concrete implementations of batch scheduler executors must override this method in
        order to generate a submit script for a job.

        Parameters
        ----------
        job
            The job to be submitted.
        context
            A dictionary containing information about the context in which the job is being
            submitted. For details, see the description of this class.
        submit_file
            An opened file-like object to which the contents of the submit script should be
            written.
        """
        pass

[docs]    @abstractmethod
    def get_submit_command(self, job: Job, submit_file_path: Path) -> List[str]:
        """Constructs a command to submit a job to a batch scheduler.

        Concrete implementations of batch scheduler executors must override this method.

        Parameters
        ----------
        job
            The job being submitted.
        submit_file_path
            The path to a submit script generated using :func:`~generate_submit_script`.

        Returns
        -------
        A list of strings representing the command and arguments to execute in order to submit
        the job, such as `['qsub', str(submit_file_path)]`.
        """
        pass

[docs]    @abstractmethod
    def job_id_from_submit_output(self, out: str) -> str:
        """Extracts a native job id from the output of the submit command.

        Concrete implementations of batch scheduler executors must override this method. This
        method is only invoked if the submit command completes with a zero exit code, so
        implementations of this method do not need to determine whether the output reflects an
        error from the submit command.

        Parameters
        ----------
        out
            The output from the submit command.

        Returns
        -------
        A string representing the native id of the newly submitted job.
        """
        pass

[docs]    @abstractmethod
    def get_cancel_command(self, native_id: str) -> List[str]:
        """Constructs a command to cancel a batch scheduler job.

        Concrete implementations of batch scheduler executors must override this method.

        Parameters
        ----------
        native_id
            The native id of the job being cancelled.

        Returns
        -------
        A list of strings representing the command and arguments to execute in order to cancel
        the job, such as, e.g., `['qdel', native_id]`.
        """
        pass

[docs]    @abstractmethod
    def process_cancel_command_output(self, exit_code: int, out: str) -> None:
        """Handle output from a failed cancel command.

        The main purpose of this method is to help distinguish between the cancel command
        failing due to an invalid job state (such as the job having completed before the cancel
        command was invoked) and other types of errors. Since job state errors are ignored, there
        are two options:

        1. Instruct the cancel command to not fail on invalid state errors and have this
        method always raise a :class:`~psij.exceptions.SubmitException`, since it is only invoked
        on "other" errors.

        2. Have the cancel command fail on both invalid state errors and other errors and
        interpret the output from the cancel command to distinguish between the two and raise
        the appropriate exception.

        Parameters
        ----------
        exit_code
            The exit code from the cancel command.
        out
            The output from the cancel command.

        Raises
        ------
        InvalidJobStateError
            Raised if the job cancellation has failed because the job was in a completed or failed
            state at the time when the cancellation command was invoked.
        SubmitException
            Raised for all other reasons.
        """
        pass

[docs]    @abstractmethod
    def get_status_command(self, native_ids: Collection[str]) -> List[str]:
        """Constructs a command to retrieve the status of a list of jobs.

        Concrete implementations of batch scheduler executors must override this method. In order
        to prevent overloading the queueing system, concrete implementations are strongly
        encouraged to return a command that only queries for the status of the indicated jobs. The
        command returned by this method should produce an output that is understood by
        :func:`~parse_status_output`.

        Parameters
        ----------
        jobs
            A collection of native ids corresponding to the jobs whose status is sought.

        Returns
        -------
        A list of strings representing the command and arguments to execute in order to get the
        status of the jobs.
        """
        pass

[docs]    @abstractmethod
    def parse_status_output(self, exit_code: int, out: str) -> Dict[str, JobStatus]:
        """Parses the output of a job status command.

        Concrete implementations of batch scheduler executors must override this method. The output
        is meant to have been produced by the command generated by :func:`~get_status_command`.

        Parameters
        ----------
        out
            The string output of the status command as prescribed by :func:`~get_status_command`.
        Returns
        -------
        A dictionary mapping native job ids to :class:`~psij.JobStatus` objects. The
        implementation of this method need not process the *exit code file* or the *script
        output file* since it is done by the base `BatchSchedulerExecutor` implementation.
        """
        pass

[docs]    @abstractmethod
    def get_list_command(self) -> List[str]:
        """Constructs a command to retrieve the list of jobs known to the LRM for the current user.

        Concrete implementations of batch scheduler executors must override this method. Upon
        running the command, the output can be parsed with :func:`~parse_list_output`.

        Returns
        -------
        A list of strings representing the executable and arguments to invoke in order to obtain
        the list of jobs the LRM knows for the current user.
        """
        pass

[docs]    def parse_list_output(self, out: str) -> List[str]:
        """Parses the output of the command obtained from :func:`~get_list_command`.

        The default implementation of this method assumes that the output has no header and
        consists of native IDs, one per line, possibly surrounded by whitespace. Concrete
        implementations should override this method if a different format is expected.

        Parameters
        ----------
        out
            The output from the "list" command as returned by :func:`~get_list_command`.
        Returns
        -------
        A list of strings representing the native IDs of the jobs known to the LRM for the current
        user.
        """
        return [s.strip() for s in out.splitlines()]

    def _create_script_context(self, job: Job) -> Dict[str, object]:
        """Build the context dictionary passed to :func:`~generate_submit_script`.

        The context contains the job, its custom attributes and environment in a
        template-friendly form, the function library, the launch command and the script
        directory. If the job has a duration, a formatted version of it is added under the
        ``formatted_job_duration`` key.
        """
        launcher = self._get_launcher_from_job(job)
        launcher_kwargs: Dict[str, str] = {}
        if isinstance(launcher, ScriptBasedLauncher) and logger.isEnabledFor(logging.DEBUG):
            # script based launchers only get a log file when debug logging is enabled
            launcher_log_path = self.work_directory / (job.id + '_launcher.log')
            launcher_kwargs['log_file'] = str(launcher_log_path.absolute())
        launch_command = launcher.get_launch_command(job, **launcher_kwargs)
        logger.debug('Launch command: %s', launch_command)

        ctx = {
            'job': job,
            'custom_attributes': _attrs_to_mustache(job),
            'env': _env_to_mustache(job),
            'psij': {
                'lib': FUNCTION_LIBRARY,
                'launch_command': launch_command,
                'script_dir': str(self.work_directory)
            }
        }
        assert job.spec is not None
        attributes = job.spec.attributes
        duration = attributes.duration if attributes else None
        if duration is not None:
            ctx['formatted_job_duration'] = self._format_duration(duration)
        return ctx

    def _format_duration(self, d: timedelta) -> str:
        # the default is hh:mm:ss, with hh not limited to 24; this is the least ambiguous
        # choice
        return '%s:%s:%s' % (int(d.total_seconds()) // 3600, (d.seconds // 60) % 60, d.seconds % 60)

    def _run_command(self, cmd: List[str]) -> str:
        res = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        if res.returncode != 0:
            msg = ''
            if res.stdout:
                msg += res.stdout
            if res.stderr:
                if msg != '':
                    msg += '\n'
                msg += res.stderr
            if msg == '':
                msg = UNKNOWN_ERROR
            raise subprocess.CalledProcessError(res.returncode, cmd, output=msg)
        else:
            return res.stdout

    def _start_queue_poll_thread(self) -> '_QueuePollThread':
        """Create and start the queue polling thread of this executor and return it."""
        thread_name = self.name + ' Queue Polling Thread'
        poll_config = cast(BatchSchedulerExecutorConfig, self.config)
        poller = _QueuePollThread(thread_name, poll_config, self)
        poller.start()
        return poller

    def _set_job_status(self, job: Job, status: JobStatus) -> None:
        """Update the status of a job unless the new state is known not to be greater.

        If the new state is final and the job has a native id, the submit script is cleaned up
        and the auxiliary files of the job are processed before the status is set.
        """
        # is_greater_than returns T/F if the states are comparable and None if not, so
        # we have to check explicitly for the boolean value rather than truthiness
        if status.state.is_greater_than(job.status.state) is not False:
            if status.state.final and job.native_id:
                self._clean_submit_script(job)
                self._read_aux_files(job, status)
            super()._set_job_status(job, status)

    def _clean_submit_script(self, job: Job) -> None:
        """Delete the submit script of a job, unless files are configured to be kept.

        This is a best-effort operation: failures are logged as warnings and not propagated.
        """
        try:
            assert isinstance(self.config, BatchSchedulerExecutorConfig)
            if self.config.keep_files:
                return
            script_path = self.work_directory / (job.id + '.job')
            try:
                script_path.unlink()
            except FileNotFoundError:
                # this can reasonably happen for attached jobs when the main
                # job cleans up the script instead
                pass
        except Exception as ex:
            logger.warning('Job %s: failed clean submit script: %s', job.id, ex)

    def _read_aux_files(self, job: Job, status: JobStatus) -> None:
        """Update a final status using the auxiliary files of a job.

        When debug logging is enabled, the launcher log of the job, if any, is logged. For jobs
        that are not canceled, the *exit code file* is read and a non-zero exit code marks the
        status as failed. For failed jobs without a message, the *script output file* is read
        and, if the launcher recognizes its contents as a launcher failure, the corresponding
        message is stored in the status. In all other cases the script output file is not
        needed and is deleted, subject to the `keep_files` setting.

        This is a best-effort operation: failures are logged as warnings and not propagated.

        Parameters
        ----------
        job
            The job whose auxiliary files are processed.
        status
            The status to update; it is modified in place.
        """
        try:
            if logger.isEnabledFor(logging.DEBUG):
                launcher_log = self._read_aux_file(path=self.work_directory
                                                   / (job.id + '_launcher.log'))
                if launcher_log is not None:
                    logger.debug('Job %s: launcher log: %s', job.id, launcher_log)
            if status.state == JobState.CANCELED:
                # exit code and other things are not very meaningful for canceled jobs
                return
            # read exit code and output files
            exit_code_str = self._read_aux_file(job, '.ec')
            if exit_code_str:
                status.exit_code = int(exit_code_str)
                if status.exit_code != 0:
                    status.state = JobState.FAILED
            if status.state == JobState.FAILED and status.message is None:
                # only read output from submit script if another error message is not
                # already present; reading the file also deletes it
                out = self._read_aux_file(job, '.out')
                if out:
                    launcher = self._get_launcher_from_job(job)
                    if launcher.is_launcher_failure(out):
                        status.message = launcher.get_launcher_failure_message(out)
                logger.debug('Output from launcher: %s', status.message)
            else:
                # the output file is not read in this case, so it must be removed explicitly
                # or it would be left behind for every job that did not fail
                self._delete_aux_file(job, '.out')

        except Exception as ex:
            logger.warning('Job %s: failed to read auxiliary files: %s', job.id, ex)

    def _read_aux_file(self, job: Optional[Job] = None, suffix: Optional[str] = None,
                       path: Optional[Path] = None) -> Optional[str]:
        """Read an auxiliary file and then delete it, subject to the `keep_files` setting.

        Parameters
        ----------
        job
            The job owning the file. Must have a native id if `path` is not specified.
        suffix
            The suffix appended to the native id of the job in order to obtain the file name
            inside the work directory. Required if `path` is not specified.
        path
            The full path of the file to read. If specified, `job` and `suffix` are not used to
            locate the file.

        Returns
        -------
        The contents of the file or `None` if the file does not exist.
        """
        if path is None:
            assert job
            assert job.native_id
            assert suffix
            path = self.work_directory / (job.native_id + suffix)
        logger.debug('Attempting to read %s', path)
        # Open directly instead of checking for existence first: an attached job may race with
        # the original job and remove the file between the check and the read.
        try:
            f = open(path)
        except FileNotFoundError:
            logger.debug('%s does not exist', path)
            return None
        try:
            with f:
                return f.read()
        finally:
            self._delete_aux_file(job=job, suffix=suffix, path=path, force=True)

    def _delete_aux_file(self, job: Optional[Job] = None, suffix: Optional[str] = None,
                         path: Optional[Path] = None, force: bool = False) -> None:
        """Delete an auxiliary file, unless files are configured to be kept.

        Parameters
        ----------
        job
            The job owning the file. Must have a native id if `path` is not specified.
        suffix
            The suffix appended to the native id of the job in order to obtain the file name
            inside the work directory. Required if `path` is not specified.
        path
            The full path of the file to delete. If specified, `job` and `suffix` are not used
            to locate the file.
        force
            If `True`, the deletion is attempted without first checking that the file exists.
        """
        assert isinstance(self.config, BatchSchedulerExecutorConfig)
        if self.config.keep_files:
            return
        if path is None:
            assert job
            assert job.native_id
            assert suffix
            path = self.work_directory / (job.native_id + suffix)
        if not force and not path.exists():
            return
        try:
            path.unlink()
        except FileNotFoundError:
            # an attached job may race with the original job and delete the file first
            pass

[docs]    def list(self) -> List[str]:
        """Returns a list of jobs known to the underlying implementation.

        See :func:`~psij.JobExecutor.list`.
        The returned list is a list of `native_id` strings representing jobs known to the
        underlying batch scheduler implementation, whether submitted through this executor or not.
        Implementations are encouraged to restrict the results to jobs accessible by the current
        user.
        """
        return self.parse_list_output(self._run_command(self.get_list_command()))

    def _current_user(self) -> str:
        return os.getlogin()


class _QueuePollThread(Thread):
    """A daemon thread that periodically polls the batch scheduler queue for job updates.

    Jobs are registered using :func:`register_job`. On each poll, the status command of the
    executor is run for all registered native ids, its output is parsed by the executor and the
    resulting statuses are applied to the jobs. Jobs that reach a final state are removed.
    """

    def __init__(self, name: str, config: BatchSchedulerExecutorConfig,
                 executor: BatchSchedulerExecutor):
        """
        Parameters
        ----------
        name
            The name of the thread.
        config
            The configuration from which the polling delay, the polling interval and the
            polling error threshold are read.
        executor
            The executor used to build and run the status command, to parse its output and to
            update the status of jobs.
        """
        super().__init__()
        self.name = name
        self.daemon = True
        self.config = config
        self.executor = executor
        # native_id -> list of jobs; attaching can associate several jobs with one native id
        self._jobs: Dict[str, List[Job]] = {}
        # counts consecutive errors while invoking qstat or equivalent
        self._poll_error_count = 0
        self._jobs_lock = RLock()

    def run(self) -> None:
        """Poll the queue forever, sleeping for the configured interval between polls."""
        logger.debug('Executor %s: queue poll thread started', self.executor)
        time.sleep(self.config.initial_queue_polling_delay)
        while True:
            try:
                self._poll()
            except Exception:
                # An exception escaping from here would end this thread and no job would ever
                # be updated again, so log it and keep polling.
                logger.exception('Executor %s: unexpected error in queue poll thread',
                                 self.executor)
            time.sleep(self.config.queue_polling_interval)

    def _poll(self) -> None:
        """Run one polling cycle for all registered jobs; does nothing if there are none."""
        with self._jobs_lock:
            if not self._jobs:
                return
            # shallow copy, so that the status command can run without holding the lock
            jobs_copy = dict(self._jobs)
        logger.info('Polling for %s jobs', len(jobs_copy))
        try:
            out = self.executor._run_command(self.executor.get_status_command(jobs_copy.keys()))
        except subprocess.CalledProcessError as ex:
            # let parse_status_output decide what a failed status command means
            out = ex.output
            exit_code = ex.returncode
        except Exception as ex:
            self._handle_poll_error(True,
                                    ex,
                                    f'Failed to poll for job status: {traceback.format_exc()}')
            return
        else:
            exit_code = 0
            self._poll_error_count = 0
        logger.debug('Output from status command: %s', out)
        try:
            status_map = self.executor.parse_status_output(exit_code, out)
        except Exception as ex:
            self._handle_poll_error(False,
                                    ex,
                                    f'Failed to poll for job status: {traceback.format_exc()}')
            return
        try:
            for native_id, job_list in jobs_copy.items():
                try:
                    status = self._get_job_status(native_id, status_map)
                except Exception:
                    status = JobStatus(JobState.FAILED,
                                       message='Failed to update job status: %s' %
                                               traceback.format_exc())
                for job in job_list:
                    self.executor._set_job_status(job, status)
                if status.state.final:
                    with self._jobs_lock:
                        del self._jobs[native_id]
        except Exception as ex:
            msg = traceback.format_exc()
            self._handle_poll_error(True, ex, 'Error updating job statuses {}'.format(msg))

    def _get_job_status(self, native_id: str, status_map: Dict[str, JobStatus]) -> JobStatus:
        """Return the status of a native job, assuming it completed if it is not in the map."""
        if native_id in status_map:
            return status_map[native_id]
        else:
            return JobStatus(JobState.COMPLETED)

    def _handle_poll_error(self, immediate: bool, ex: Exception, msg: str) -> None:
        """Record a polling error and, if warranted, fail all registered jobs.

        Parameters
        ----------
        immediate
            If `True`, all registered jobs are failed right away. Otherwise, they are only
            failed once the number of recorded errors exceeds the configured threshold.
        ex
            The exception that triggered the error. It is currently not used.
        msg
            The message to log and to store in the status of the failed jobs.
        """
        logger.warning('Polling error: %s', msg)
        self._poll_error_count += 1
        if immediate or (self._poll_error_count > self.config.queue_polling_error_threshold):
            self._poll_error_count = 0
            # fail all jobs
            with self._jobs_lock:
                # We should only poll if there is at least one job, so we should not be in a
                # situation when we polled and there were no jobs to poll for.
                # Internal errors are a bit different, since they could, in principle, occur
                # after the last job was processed and removed from self._jobs. There is
                # nothing to fail in that case, so simply return rather than raise from
                # within the polling thread.
                if not self._jobs:
                    return
                jobs_copy = dict(self._jobs)
                self._jobs.clear()
            for job_list in jobs_copy.values():
                for job in job_list:
                    self.executor._set_job_status(job, JobStatus(JobState.FAILED, message=msg))

    def register_job(self, job: Job) -> None:
        """Register a job, which must have a native id, to be updated by this thread."""
        assert job.native_id
        logger.info('Job %s: registering', job.id)
        with self._jobs_lock:
            self._jobs.setdefault(job.native_id, []).append(job)