#!/bin/bash

#
# Unused functions
# Run inotifywait on the /dev/nvidia-caps path for create, move and delete
# events.
# Globals: NVIDIA_CAPS_PATH (written)
# Returns: the exit status of inotifywait
notif_on_change() {
    NVIDIA_CAPS_PATH=/dev/nvidia-caps
    inotifywait --event create,move,delete "$NVIDIA_CAPS_PATH"
}

# Regenerate the override drop-in and restart the docker service.
# Both steps are best-effort: a failure is reported on stdout and the
# function still goes on (and still returns 0).
exec_on_change() {
    echo "Updating containerd override and restart docker daemon"

    if ! create_containerd_override; then
        echo "Failed to create containerd override"
    fi

    if ! systemctl restart docker; then
        echo "Failed to restart docker"
    fi
}
# End unused functions
#

# Write a systemd drop-in whose DeviceAllow= entries cover the NVIDIA
# character devices under /dev, except the /dev/nvidiaN nodes of GPUs that
# are NOT PCI class "3D controller".
#
# On RHEL 7 / CentOS 7 the DeviceAllow lines of
#   /etc/systemd/system/docker.service.d/docker-override.conf
# are replaced (the rest of the file is kept); on every other OS
#   /etc/systemd/system/containerd.service.d/containerd-override.conf
# is recreated from scratch. "systemctl daemon-reload" is run afterwards.
#
# If lspci reports no NVIDIA 3D controller, every GPU minor number counts as
# "non-3D" and all /dev/nvidiaN nodes are left out of the allow list.
#
# Globals: /etc/os-release is sourced into the current shell (ID, VERSION, ...)
# Outputs: status messages on stdout
# Returns: 1 if /usr/bin/nvidia-smi does not exist, 0 otherwise
create_containerd_override()
{
    # If nvidia-smi is not found raise err and exit
    if [ ! -f /usr/bin/nvidia-smi ]; then
        echo "nvidia-smi not found. Returning error."
        return 1
    fi

    local bdf minor number dev conf_dir conf_file
    local -a bdf_list=() minors_3d=() non_3d_minors=() nvidia_devs=()

    # Retrieve the PCI addresses of all 3D controller class NVIDIA GPUs
    mapfile -t bdf_list < <(lspci | grep "3D controller" | grep -i NVIDIA | cut -d' ' -f1)

    # Translate each PCI address into the GPU's minor number (the N in
    # /dev/nvidiaN). Everything is kept in memory: this runs as root, so
    # fixed file names under /tmp are deliberately avoided.
    for bdf in "${bdf_list[@]}"; do
        minor=$(nvidia-smi -q -i "${bdf}" | awk '/Minor Number/ {print $4}')
        if [[ -n ${minor} ]]; then
            minors_3d+=("${minor}")
        fi
    done

    # Minor numbers of the NVIDIA GPUs which are NOT of class 3D controller:
    # lines present only in the "all GPUs" list (comm -13).
    mapfile -t non_3d_minors < <(comm -13 \
        <(printf '%s\n' "${minors_3d[@]}" | sort) \
        <(nvidia-smi -q | grep Minor | awk '{print $4}' | sort))

    # Determine OS type
    . /etc/os-release

    if [[ (${ID} == rhel || ${ID} == centos) && ${VERSION} == 7* ]]; then
        # For RHEL 7 and CentOS 7 modify docker-override.conf
        conf_dir="/etc/systemd/system/docker.service.d"
        conf_file="${conf_dir}/docker-override.conf"

        # Make sure the file exists
        if [[ ! -f ${conf_file} ]]; then
            mkdir -p "${conf_dir}" > /dev/null 2>&1
            echo "[Service]" > "${conf_file}"
        fi

        # Remove any existing DeviceAllow lines
        sed -i "/DeviceAllow/ d" "${conf_file}"
    else
        # For everything else remove old containerd-override.conf and create a new one
        conf_dir="/etc/systemd/system/containerd.service.d"
        conf_file="${conf_dir}/containerd-override.conf"

        rm -f "${conf_file}" > /dev/null 2>&1
        mkdir -p "${conf_dir}" > /dev/null 2>&1
        echo "[Service]" > "${conf_file}"
    fi

    # Add all /dev/nvidia* character devices (nvidia-caps nodes excluded here,
    # they are covered by the device group entry below) as DeviceAllow
    mapfile -t nvidia_devs < <(find /dev/ -type c | grep nvidia | grep -v nvidia-caps | sort)
    for dev in "${nvidia_devs[@]}"; do
        sed -i "\$ aDeviceAllow=${dev}" "${conf_file}"
    done

    # Add all nvidia-caps devices
    sed -i "\$ aDeviceAllow=char-nvidia-caps" "${conf_file}"

    # Remove GPUs which are NOT of Class '3D Controller'
    for number in "${non_3d_minors[@]}"; do
        # The value is spliced into a sed regex; skip anything that is not a
        # plain number (blank line, "N/A", ...).
        [[ ${number} =~ ^[0-9]+$ ]] || continue
        sed -i "/\bnvidia${number}\b/ d" "${conf_file}"
    done

    echo "Successfully configured nvidia docker to only use Compute GPUs"

    systemctl daemon-reload

    return 0
}

### MAIN ###
# Root is required.
if (( EUID != 0 )); then
    echo "Requires sudo to run"
    exit 1
fi

# Load the platform helper functions used below.
plat_funcs="/usr/local/sbin/nv_scripts/plat_funcs.bash"
source "${plat_funcs}"
# NOTE(review): prodname is not referenced again in this file; it may be read
# by the sourced helpers - confirm before removing.
prodname="$(get_system_product_name)"

# Nothing to do on platforms that do not need the override.
if ! plat_needs_containerd_override; then
    exit 0
fi

# Create the override file once; its return status is not checked.
create_containerd_override

# [bug 200744004]: No need to check on /dev/nvidia-caps anymore.  We are
# already allowing all such devices by using "DeviceAllow=char-nvidia-caps"
# Former main loop that looked for changes in GPU topology:
#while notif_on_change; do
#    exec_on_change
#done

exit 0
