#!/bin/bash
#
# Link-step wrapper. For every object / static library on the command line it
# splits the ".kernel_ir" section (LLVM bitcode) from the host code, lowers
# the kernels for each AMDGPU target through clamp-device, bundles the results
# with clang-offload-bundler, embeds the bundle into an object via clamp-embed
# and finally invokes "ld" on the host objects plus that kernel object.
#
# The @CMAKE_SYSTEM_PROCESSOR@ and @AMDGPU_TARGET@ tokens are placeholders that
# are expected to be substituted before this script is installed.
#
# Environment variables read:
#   KMDBSCRIPT        "1" enables "set -x" tracing
#   KMDUMPDIR         directory where dumps are written (default ".")
#   KMDUMPISA         "1" copies the generated .hsaco files (ThinLTO path)
#   KMTHINLTO         "1" selects the ThinLTO lowering path
#   KMDUMPBUNDLE      "1" copies the offload bundle to ./dump.bundle
#   HCC_AMDGPU_TARGET comma separated list of GPU targets (see below)

# enable bash debugging
KMDBSCRIPT="${KMDBSCRIPT:-0}"
if [[ "$KMDBSCRIPT" == "1" ]]; then
  set -x
fi

# directory where files are dumped
KMDUMPDIR="${KMDUMPDIR:-.}"

# dump the isa
KMDUMPISA="${KMDUMPISA:-0}"

# determine if we want to use ThinLTO
KMTHINLTO="${KMTHINLTO:-0}"

# check number of arguments
if [[ "$#" -eq 0 ]]; then
  echo "$0 is NOT recommended to be directly used" >&2
  exit 1
fi

# enable bundle dumping
KMDUMPBUNDLE="${KMDUMPBUNDLE:-0}"

# flag for function calls enabled
AMDGPU_FUNC_CALLS="0"

# flag for COV3
HCC_COV3="0"

# flag for optimisation level
HCC_OPT=""

# Directory holding the helper tools. It is made absolute because clamp-embed
# is executed after changing into the temporary directory, where a relative
# tool path would no longer resolve.
BINDIR=$(dirname "$0")
if ABS_BINDIR=$(cd -- "$BINDIR" > /dev/null 2>&1 && pwd); then
  BINDIR=$ABS_BINDIR
fi
LINK=$BINDIR/llvm-link
LTO=$BINDIR/llvm-lto
OPT=$BINDIR/opt
CLAMP_DEVICE=$BINDIR/clamp-device
LLD=$BINDIR/ld.lld
CLAMP_EMBED=$BINDIR/clamp-embed
CLANG_OFFLOAD_BUNDLER=$BINDIR/clang-offload-bundler

################
# Verbose flag
################

VERBOSE=0

################
# GPU targets
# This array could be populated via 3 methods:
# 1) clang driver adds multiple --amdgpu-target= options
# 2) HCC_AMDGPU_TARGET env var, multiple targets are delimited by ",".
#    Example:
#
#    export HCC_AMDGPU_TARGET=gfx701,gfx801,gfx802,gfx803
#    # ISA for Hawaii(gfx701), Carrizo(gfx801), Tonga(gfx802) and Fiji(gfx803)
#    # would be produced
#    hcc `hcc-config --cxxflags --ldflags` foo.cpp
#
# 3) CMake AMDGPU_TARGET config value. This is the last resort option.
################

AMDGPU_TARGET_ARRAY=()

################
# link
################

LINK_KERNEL_ARGS=()
LINK_HOST_ARGS=()
LINK_CPU_ARG=()
LINK_OTHER_ARGS=()

# ELF section names for IR and ISA. Defined before argument parsing so that
# KERNEL_SECTION is always set when clamp-embed is invoked, even if every
# argument turned out to be an option.
KERNEL_IR_SECTION=".kernel_ir"
KERNEL_SECTION=".kernel"

TEMP_DIR=$(mktemp -d)
ret=$?
if [[ $ret -ne 0 || -z "$TEMP_DIR" ]]; then
  echo "$0: failed to create a temporary directory" >&2
  exit 1
fi

# Remove the temporary directory (intermediate kernels, host objects, extracted
# archives, bundle, ...) on every exit path, including early error exits.
_cleanup() {
  if [[ -n "$TEMP_DIR" && -d "$TEMP_DIR" ]]; then
    rm -rf -- "$TEMP_DIR"
  fi
}
trap _cleanup EXIT

# a file which contains the list of __cxxamp_serialize symbols in each CPU object file
CXXAMP_SERIALIZE_SYMBOL_FILE=$TEMP_DIR/symbol.txt
touch "$CXXAMP_SERIALIZE_SYMBOL_FILE"

# determine `file` output per host architecture
if [[ "@CMAKE_SYSTEM_PROCESSOR@" == "x86_64" ]]; then
  OBJ_FILE_FORMAT_STRING="ELF 64-bit LSB relocatable, x86-64"
elif [[ "@CMAKE_SYSTEM_PROCESSOR@" == "aarch64" ]]; then
  OBJ_FILE_FORMAT_STRING="ELF 64-bit LSB relocatable, ARM aarch64"
elif [[ "@CMAKE_SYSTEM_PROCESSOR@" == "ppc64" ]]; then
  OBJ_FILE_FORMAT_STRING="ELF 64-bit LSB relocatable, ppc64"
elif [[ "@CMAKE_SYSTEM_PROCESSOR@" == "ppc64le" ]]; then
  OBJ_FILE_FORMAT_STRING="ELF 64-bit LSB relocatable, ppc64le"
else
  OBJ_FILE_FORMAT_STRING="ELF 64-bit LSB relocatable"
fi

#######################################
# find object file
# Prints the given path with a trailing ".o" removed; if the path exists and
# both `file` and `readelf` report a relocatable object, the last extension
# (whatever it is) is removed instead.
# Globals:   OBJ_FILE_FORMAT_STRING (read)
# Arguments: $1 - path to inspect
# Outputs:   the stripped name on stdout
#######################################
_find_object() {
  local FILE=$1
  local ret=${FILE%.o}
  local file_output
  local readelf_output

  if [[ -e "$FILE" ]]; then
    file_output=$(file "$FILE" | grep "$OBJ_FILE_FORMAT_STRING")
    readelf_output=$(readelf -h "$FILE" 2>&1 | grep 'Relocatable file')

    if [[ -n "$file_output" && -n "$readelf_output" ]]; then
      # remove postfix
      ret=${FILE%.*}
    fi
  fi

  echo "$ret"
}

# detect the verbose flags before doing anything
if [[ "$*" =~ --verbose ]]; then
  VERBOSE=1
fi
#very verbose
#VERBOSE=2

STATIC_LIB_LIST=()

#######################################
# Populate CLAMP_DEVICE_FLAGS with the optional clamp-device switches that were
# requested on the command line. An array is used so that an empty HCC_OPT does
# not turn into an empty argument.
# Globals: AMDGPU_FUNC_CALLS, HCC_COV3, HCC_OPT (read); CLAMP_DEVICE_FLAGS (written)
#######################################
_set_clamp_device_flags() {
  CLAMP_DEVICE_FLAGS=()
  if [[ "$AMDGPU_FUNC_CALLS" == "1" ]]; then
    CLAMP_DEVICE_FLAGS+=( "--amdgpu-func-calls" )
  fi
  if [[ "$HCC_COV3" == "1" ]]; then
    CLAMP_DEVICE_FLAGS+=( "--hcc-cov3" )
  fi
  if [[ -n "$HCC_OPT" ]]; then
    CLAMP_DEVICE_FLAGS+=( "$HCC_OPT" )
  fi
}

#######################################
# Wait for every PID passed as argument.
# Returns: 0 if all of them succeeded, otherwise the first non-zero status.
#          (Statuses are not summed: a sum can wrap to a multiple of 256 and
#          be reported as success.)
#######################################
_wait_all() {
  local pid
  local rc
  local status=0

  for pid in "$@"; do
    [[ -n "$pid" ]] || continue
    wait "$pid"
    rc=$?
    if [[ $rc -ne 0 && $status -eq 0 ]]; then
      status=$rc
    fi
  done
  return $status
}

#######################################
# Using ThinLTO for link-time independent kernels calling in-parallel
# clamp-device opt and llc.
# Globals:   LINK_KERNEL_ARGS, AMDGPU_TARGET_ARRAY, TEMP_DIR, KMDUMPISA,
#            KMDUMPDIR, VERBOSE (read); ret,
#            CLANG_OFFLOAD_BUNDLER_INPUTS_ARGS,
#            CLANG_OFFLOAD_BUNDLER_TARGETS_ARGS (written)
# Exits the script with the failing status on any error.
#######################################
_thinlto_path() {
  local KERNEL
  local AMDGPU_TARGET
  local HSACO_FILE
  local -a pids=()
  local -a isabin_inputs=()
  local MAX_LIMIT=8

  if [[ "$VERBOSE" != 0 ]]; then
    echo "Generating AMD GCN Kernels: ${LINK_KERNEL_ARGS[*]}"
  fi

  # create module summaries for each kernel and perform cross-module importing
  for KERNEL in "${LINK_KERNEL_ARGS[@]}"; do
    "$OPT" -thinlto-bc -disable-opt -module-summary "$KERNEL" -o "$KERNEL"
    ret=$?
    if [[ $ret -ne 0 ]]; then
      exit $ret
    fi
  done

  "$LTO" -thinlto -thinlto-action=thinlink "${LINK_KERNEL_ARGS[@]}" \
    -o "$TEMP_DIR/kernel.thinlto.index"
  ret=$?
  if [[ $ret -ne 0 ]]; then
    exit $ret
  fi

  "$LTO" -thinlto -thinlto-action=import "${LINK_KERNEL_ARGS[@]}" \
    "-thinlto-index=$TEMP_DIR/kernel.thinlto.index" -import-instr-limit=1048576
  ret=$?
  if [[ $ret -ne 0 ]]; then
    exit $ret
  fi

  _set_clamp_device_flags

  # for each GPU target, lower LLVM IR kernels into GCN ISA
  for KERNEL in "${LINK_KERNEL_ARGS[@]}"; do
    if [[ "$VERBOSE" != 0 ]]; then
      echo "Processing: $KERNEL"
    fi

    # Only allow at most $MAX_LIMIT sub-processes
    if [[ $(jobs 2>&1 | grep -c Running) -ge $MAX_LIMIT ]]; then
      _wait_all "${pids[@]}" || ret=$?
      pids=()
    fi

    # Perform clamp-device's opt and llc on each kernel individually in parallel
    for AMDGPU_TARGET in "${AMDGPU_TARGET_ARRAY[@]}"; do
      "$CLAMP_DEVICE" "$KERNEL.thinlto.imported.bc" \
        "$KERNEL-$AMDGPU_TARGET.isabin" \
        "--amdgpu-target=$AMDGPU_TARGET" "${CLAMP_DEVICE_FLAGS[@]}" &

      # remember the job so that its exit status can be collected
      pids+=( "$!" )

      if [[ "$VERBOSE" != 0 ]]; then
        echo "Constructing: $KERNEL-$AMDGPU_TARGET.isabin"
      fi
    done
  done

  # collect all the error codes from forked processes
  _wait_all "${pids[@]}" || ret=$?
  if [[ $ret -ne 0 ]]; then
    exit $ret
  fi

  for AMDGPU_TARGET in "${AMDGPU_TARGET_ARRAY[@]}"; do
    # The .isabin files of this target, in kernel order, for ld.lld.
    isabin_inputs=()
    for KERNEL in "${LINK_KERNEL_ARGS[@]}"; do
      isabin_inputs+=( "$KERNEL-$AMDGPU_TARGET.isabin" )
    done

    # Same naming scheme as the default path, instead of deriving the name
    # from whichever kernel happened to be processed last.
    HSACO_FILE="$TEMP_DIR/kernel-$AMDGPU_TARGET.hsaco"

    # Combine all .isabin files into HSACO format
    "$LLD" -shared "${isabin_inputs[@]}" -o "$HSACO_FILE"
    ret=$?
    if [[ $ret -ne 0 ]]; then
      echo "Generating AMD GCN kernel failed in ld.lld for target: $AMDGPU_TARGET" >&2
      exit $ret
    fi

    if [[ "$KMDUMPISA" == "1" ]]; then
      cp "$HSACO_FILE" "${KMDUMPDIR}/dump-$AMDGPU_TARGET.hsaco"
      ret=$?
      if [[ $ret -ne 0 ]]; then
        exit $ret
      fi
    fi

    if [[ "$VERBOSE" != 0 ]]; then
      echo "Generated: $HSACO_FILE"
    fi

    # augment arguments to clang-offload-bundler
    CLANG_OFFLOAD_BUNDLER_INPUTS_ARGS+=",$HSACO_FILE"
    CLANG_OFFLOAD_BUNDLER_TARGETS_ARGS+=",hcc-amdgcn-amd-amdhsa--$AMDGPU_TARGET"
  done
}

#######################################
# Using default path for link-time opt and llc in clamp-device: all kernels
# are merged with llvm-link and lowered once per GPU target, in parallel.
# Globals:   LINK_KERNEL_ARGS, AMDGPU_TARGET_ARRAY, TEMP_DIR, VERBOSE (read);
#            ret, CLANG_OFFLOAD_BUNDLER_INPUTS_ARGS,
#            CLANG_OFFLOAD_BUNDLER_TARGETS_ARGS (written)
# Exits the script with the failing status on any error.
#######################################
_default_path() {
  local AMDGPU_TARGET
  local -a pids=()

  # combine kernel sections together
  "$LINK" "${LINK_KERNEL_ARGS[@]}" -o "$TEMP_DIR/kernel.bc"
  ret=$?
  if [[ $ret -ne 0 ]]; then
    exit $ret
  fi

  if [[ "$VERBOSE" == 1 ]]; then
    echo "Generating AMD GCN kernel"
  fi

  _set_clamp_device_flags

  # for each GPU target, lower to GCN ISA in HSACO format
  for AMDGPU_TARGET in "${AMDGPU_TARGET_ARRAY[@]}"; do
    "$CLAMP_DEVICE" "$TEMP_DIR/kernel.bc" \
      "$TEMP_DIR/kernel-$AMDGPU_TARGET.hsaco" \
      "--amdgpu-target=$AMDGPU_TARGET" "${CLAMP_DEVICE_FLAGS[@]}" &

    # Record only the new PID; re-appending the existing list would duplicate
    # every entry on each iteration.
    pids+=( "$!" )

    # augment arguments to clang-offload-bundler
    CLANG_OFFLOAD_BUNDLER_INPUTS_ARGS+=",$TEMP_DIR/kernel-$AMDGPU_TARGET.hsaco"
    CLANG_OFFLOAD_BUNDLER_TARGETS_ARGS+=",hcc-amdgcn-amd-amdhsa--$AMDGPU_TARGET"
  done

  # collect all the error codes from forked processes
  _wait_all "${pids[@]}" || ret=$?
  if [[ $ret -ne 0 ]]; then
    exit $ret
  fi
}

# gather a list of library search paths
LIB_SEARCH_PATHS=()
for ARG in "$@"; do
  # matches -L
  if [[ "$ARG" == -L* ]]; then
    REAL_PATH="$(readlink -f "${ARG:2}")"
    if [[ "$VERBOSE" == 2 ]]; then
      echo "add library path: ${ARG:2}, canonical path: $REAL_PATH"
    fi
    LIB_SEARCH_PATHS+=( "$REAL_PATH" )
  fi
done

# gather input arguments from linker command files
INPUT_ARGUMENTS=()
for ARG in "$@"; do
  # matches @
  if [[ "$ARG" == @* ]]; then
    REAL_PATH="$(readlink -f "${ARG:1}")"
    if [[ "$VERBOSE" == 2 ]]; then
      echo "add linker command path: ${ARG:1}, canonical path: $REAL_PATH"
    fi

    # read from linker command file; the array is reset first so that an
    # unreadable file cannot replay the previous file's contents
    RSP_LINES=()
    IFS=$'\n' read -d '' -r -a RSP_LINES < "$REAL_PATH"

    # NOTE(review): the expansion is intentionally left unquoted so that a
    # line holding several whitespace separated arguments is split into
    # individual arguments, as the original code did.
    for LINE in ${RSP_LINES[@]}; do
      # remove the extra quotes wrapping the command
      LINE=$(echo "$LINE" | sed -e 's/^"*//' -e 's/"*$//')
      if [[ "$VERBOSE" == 2 ]]; then
        echo "Stripped: $LINE"
        echo "add linker command: $LINE"
      fi
      if [[ -n "$LINE" ]]; then
        INPUT_ARGUMENTS+=( "$LINE" )
      fi
    done
  else
    INPUT_ARGUMENTS+=( "$ARG" )
  fi
done

for ARG in "${INPUT_ARGUMENTS[@]}"; do

  case "$ARG" in
    ##########################
    # Parse optimisation level
    ##########################
    -O*)
      HCC_OPT=$ARG
      continue
      ;;
    ######################
    # Parse AMDGPU target
    ######################
    --amdgpu-target=*)
      AMDGPU_TARGET_ARRAY+=( "${ARG#*=}" )
      continue
      ;;
    --amdgpu-func-calls)
      AMDGPU_FUNC_CALLS="1"
      continue
      ;;
    --hcc-cov3)
      HCC_COV3="1"
      continue
      ;;
    ################################################
    # Parse dump options and export them for called
    # scripts e.g. clamp-device
    ################################################
    -dump-isa)
      export KMDUMPISA=1
      continue
      ;;
    -dump-llvm)
      export KMDUMPLLVM=1
      continue
      ;;
    -dump-dir=*)
      export KMDUMPDIR="${ARG#*=}"
      continue
      ;;
  esac

  #####################################
  # detect object or static library
  #####################################

  OBJS_TO_PROCESS=()

  if [[ "$ARG" =~ [^[:space:]]+\.cpu$ ]]; then

    cp "$ARG" "$TEMP_DIR/kernel_cpu.o"
    ret=$?
    if [[ $ret -ne 0 ]]; then
      exit $ret
    fi
    LINK_CPU_ARG+=( "$TEMP_DIR/kernel_cpu.o" )

  elif [[ "$ARG" =~ [^[:space:]]+\.o$ ]]; then

    # detected a .o file
    if [[ "$VERBOSE" == 2 ]]; then
      echo "detect object file to process further: $ARG"
    fi
    OBJS_TO_PROCESS+=( "$ARG" )

  elif [[ "$ARG" =~ ^-l[^[:space:]]+$ ]] \
    || [[ "$ARG" =~ [^[:space:]]+\.a$ ]] \
    || [[ "$ARG" =~ [^[:space:]]+\.lo$ ]]; then
    # The dots above are escaped on purpose: an unescaped "." would make any
    # argument that merely ends in "a" or "lo" look like a static library.

    # process a static library
    DETECTED_STATIC_LIBRARY=""

    # detected whether it's an -l option
    if [[ "$ARG" =~ ^-l[^[:space:]]+$ ]]; then

      # expand the option into a library name
      STATIC_LIB_NAME="lib${ARG:2}.a"
      if [[ "$VERBOSE" == 2 ]]; then
        echo "looking for static library $STATIC_LIB_NAME"
      fi

      # look for the static library in the library search paths
      for LIB_PATH in "${LIB_SEARCH_PATHS[@]}"; do
        FULL_LIB_PATH=$(readlink -f "$LIB_PATH/$STATIC_LIB_NAME")
        if [[ "$VERBOSE" == 2 ]]; then
          echo "trying to detect $FULL_LIB_PATH"
        fi
        if [[ -f "$FULL_LIB_PATH" ]]; then
          if [[ "$VERBOSE" == 2 ]]; then
            echo "$FULL_LIB_PATH detected"
          fi
          DETECTED_STATIC_LIBRARY="$FULL_LIB_PATH"
          break
        fi
      done

    else

      # this is .a or .lo static library file specified at the command line
      if [[ -f "$ARG" ]]; then
        FULL_LIB_PATH=$(readlink -f "$ARG")
        if [[ "$VERBOSE" == 2 ]]; then
          echo "use .a / .lo specified at: $FULL_LIB_PATH"
        fi
        DETECTED_STATIC_LIBRARY="$FULL_LIB_PATH"
      fi

    fi # if [[ "$ARG" =~ ^-l[^[:space:]]+$ ]]; then

    # check for duplicated static library options
    if [[ "$DETECTED_STATIC_LIBRARY" != "" ]]; then
      for LIB in "${STATIC_LIB_LIST[@]}"; do
        if [[ "$LIB" == "$DETECTED_STATIC_LIBRARY" ]]; then
          # this library has already been looked at, skip it
          DETECTED_STATIC_LIBRARY=""
          break
        fi
      done
      if [[ "$DETECTED_STATIC_LIBRARY" != "" ]]; then
        STATIC_LIB_LIST+=( "$DETECTED_STATIC_LIBRARY" )
      fi
    fi

    KERNEL_UNDETECTED="1"
    if [[ "$DETECTED_STATIC_LIBRARY" != "" ]]; then

      # we found a static library
      if [[ "$VERBOSE" == 2 ]]; then
        echo "processing static library $DETECTED_STATIC_LIBRARY"
      fi

      # detect whether the objects in the static library contain a .kernel_ir section
      KERNEL_UNDETECTED=$(objdump -t "$DETECTED_STATIC_LIBRARY" 2> /dev/null \
        | grep -q "$KERNEL_IR_SECTION"; echo $?)
      if [[ "$KERNEL_UNDETECTED" == "0" ]]; then

        # .kernel_ir section detected, extract the objects from the archive
        if [[ "$VERBOSE" == 2 ]]; then
          echo "kernel detected in $DETECTED_STATIC_LIBRARY"
        fi

        # extract the archive
        FILE=$(basename "$DETECTED_STATIC_LIBRARY")
        AR_TEMP_DIR="$TEMP_DIR/$FILE"
        if [[ "$VERBOSE" == 2 ]]; then
          echo "creating temp dir: $AR_TEMP_DIR"
        fi
        mkdir -p "$AR_TEMP_DIR"
        ret=$?
        if [[ $ret -ne 0 ]]; then
          exit $ret
        fi

        # Extract inside a subshell so the working directory of this script
        # is untouched and a failing "cd" is reported like a failing "ar".
        ( cd "$AR_TEMP_DIR" && ar x "$DETECTED_STATIC_LIBRARY" )
        ret=$?
        if [[ $ret -ne 0 ]]; then
          exit $ret
        fi

        # store all the extracted objects to process further
        for EXTRACTED_OBJ in "$AR_TEMP_DIR"/*.o; do
          # an unmatched glob stays literal, so make sure the file exists
          if [[ -e "$EXTRACTED_OBJ" ]]; then
            OBJS_TO_PROCESS+=( "$EXTRACTED_OBJ" )
          fi
        done

      fi # if [[ $KERNEL_UNDETECTED == "0" ]]; then
    fi # if [[ $DETECTED_STATIC_LIBRARY != "" ]]; then

  elif [[ -f "$ARG" ]]; then

    # an object file but doesn't have an .o extension??
    file_output=$(file "$ARG" | grep "$OBJ_FILE_FORMAT_STRING")
    readelf_output=$(readelf -h "$ARG" 2>&1 | grep 'Relocatable file')
    if [[ -n "$file_output" && -n "$readelf_output" ]]; then
      OBJS_TO_PROCESS+=( "$ARG" )
    fi

  fi

  # no objects to further process, pass the original args down to the host linker
  if [[ ${#OBJS_TO_PROCESS[@]} -eq 0 ]]; then
    if [[ "$VERBOSE" == 2 ]]; then
      echo "passing down link args: $ARG"
    fi
    LINK_OTHER_ARGS+=( "$ARG" )
    continue
  fi

  # process the objects we put aside
  for OBJ in "${OBJS_TO_PROCESS[@]}"; do
    if [[ "$VERBOSE" == 2 ]]; then
      echo "processing $OBJ"
    fi

    # detect whether the object contains a .kernel_ir section
    KERNEL_UNDETECTED=$(objdump -t "$OBJ" 2> /dev/null \
      | grep -q "$KERNEL_IR_SECTION"; echo $?)

    if [[ "$KERNEL_UNDETECTED" == "0" ]]; then

      FILE=$(basename "$OBJ") # remove path
      FILENAME="${FILE%.*}"
      KERNEL_FILE="$TEMP_DIR/$FILENAME.kernel.bc"
      HOST_FILE="$TEMP_DIR/$FILENAME.host.o"

      # extract kernel section
      objcopy -O binary -j "$KERNEL_IR_SECTION" "$OBJ" "$KERNEL_FILE"
      ret=$?
      if [[ $ret -ne 0 ]]; then
        exit $ret
      fi

      # extract host section
      objcopy -R "$KERNEL_IR_SECTION" "$OBJ" "$HOST_FILE"
      ret=$?
      if [[ $ret -ne 0 ]]; then
        exit $ret
      fi

      # apply the options accumulated in symbol.txt ("-L<symbol>" lines, see
      # below) to $HOST_FILE; failures are ignored on purpose (best effort)
      objcopy "@$CXXAMP_SERIALIZE_SYMBOL_FILE" "$HOST_FILE" "$HOST_FILE.new" 2> /dev/null
      if [[ -f "$HOST_FILE.new" ]]; then
        mv "$HOST_FILE.new" "$HOST_FILE"
      fi

      # find cxxamp_serialize symbols and save them into symbol.txt
      objdump -t "$HOST_FILE" -j .text 2> /dev/null \
        | grep "g.*__cxxamp_serialize" \
        | awk '{print "-L"$6}' >> "$CXXAMP_SERIALIZE_SYMBOL_FILE"
      ret=$?
      if [[ $ret -ne 0 ]]; then
        exit $ret
      fi

      # if the kernel file is empty, just throw it away
      KERNEL_FILE_SIZE=$(wc -c < "$KERNEL_FILE")
      if [[ "$KERNEL_FILE_SIZE" -ne 0 ]]; then
        LINK_KERNEL_ARGS+=( "$KERNEL_FILE" )
      fi

      LINK_HOST_ARGS+=( "$HOST_FILE" )
    else
      LINK_OTHER_ARGS+=( "$OBJ" )
    fi

  done # for OBJ in "${OBJS_TO_PROCESS[@]}"
done

# AMDGPU_TARGET_ARRAY could be overridden by HCC_AMDGPU_TARGET env variable
if [[ -n "$HCC_AMDGPU_TARGET" ]]; then
  # tokenize HCC_AMDGPU_TARGET
  IFS=',' read -r -a HCC_AMDGPU_TARGET_TOKENS <<< "$HCC_AMDGPU_TARGET"

  # parse individual token, override AMDGPU_TARGET_ARRAY
  # NOTE(review): the override is index-wise, so targets given by the driver
  # beyond the number of env tokens are kept - confirm this is intended.
  iter=0
  for HCC_AMDGPU_TARGET_TOKEN in "${HCC_AMDGPU_TARGET_TOKENS[@]}"; do
    [[ -n "$HCC_AMDGPU_TARGET_TOKEN" ]] || continue
    AMDGPU_TARGET_ARRAY[$iter]=$HCC_AMDGPU_TARGET_TOKEN
    iter=$((iter + 1))
  done
fi

# In case neither Clang Driver nor HCC_AMDGPU_TARGET env var specify target,
# use default value in CMake configuration
if [[ ${#AMDGPU_TARGET_ARRAY[@]} -eq 0 ]]; then
  AMDGPU_TARGET_ARRAY=(@AMDGPU_TARGET@)
fi

if [[ "$VERBOSE" != 0 ]]; then
  echo "AMDGPU target array: ${AMDGPU_TARGET_ARRAY[*]}"
  echo ""
  echo "new kernel args: ${LINK_KERNEL_ARGS[*]}"
  echo ""
  echo "new host args: ${LINK_HOST_ARGS[*]}"
  echo ""
  echo "new other args: ${LINK_OTHER_ARGS[*]}"
  echo ""
fi

# linker return value
ret=0

# touch an empty object for host part, to accommodate rule required by
# clang-offload-bundler
touch "$TEMP_DIR/__empty.o"

# arguments for clang-offload-bundler; the lowering paths append one
# ",<file>" / ",<target>" entry per GPU target
CLANG_OFFLOAD_BUNDLER_INPUTS_ARGS="-inputs=$TEMP_DIR/__empty.o"
CLANG_OFFLOAD_BUNDLER_TARGETS_ARGS="-targets=host-@CMAKE_SYSTEM_PROCESSOR@-unknown-linux"

# only do kernel lowering if there are objects given
if [[ ${#LINK_KERNEL_ARGS[@]} -ne 0 ]]; then
  if [[ "$KMTHINLTO" == "1" ]]; then
    _thinlto_path
  else
    _default_path
  fi

  if [[ "$VERBOSE" != 0 ]]; then
    echo "Finished generation of AMD GCN kernels"
  fi
fi

# invoke clang-offload-bundler to create kernel bundle
"$CLANG_OFFLOAD_BUNDLER" -type=o \
  "$CLANG_OFFLOAD_BUNDLER_INPUTS_ARGS" \
  "$CLANG_OFFLOAD_BUNDLER_TARGETS_ARGS" \
  "-outputs=$TEMP_DIR/kernel.bundle"

# error handling
ret=$?
if [[ $ret -ne 0 ]]; then
  exit $ret
fi

if [[ "$KMDUMPBUNDLE" == "1" ]]; then
  cp "$TEMP_DIR/kernel.bundle" ./dump.bundle
fi

# build a new kernel object; clamp-embed is run from inside the temporary
# directory with relative file names, as before, but in a subshell so the
# working directory of this script is left alone
( cd "$TEMP_DIR" && "$CLAMP_EMBED" kernel.bundle kernel_hsa.o "$KERNEL_SECTION" )
ret=$?
if [[ $ret -ne 0 ]]; then
  exit $ret
fi

# link everything together
ld --allow-multiple-definition "$TEMP_DIR/kernel_hsa.o" \
  "${LINK_HOST_ARGS[@]}" "${LINK_CPU_ARG[@]}" "${LINK_OTHER_ARGS[@]}"
ret=$?

# temp files are removed by the EXIT trap installed above

# return value
exit $ret