Skip to content

Fix TransformerEngine and Activation checkpointing interaction for thunderFX #5033

Fix TransformerEngine and Activation checkpointing interaction for thunderFX

Fix TransformerEngine and Activation checkpointing interaction for thunderFX #5033

Workflow file for this run

name: CI testing
# see: https://help.github.com/en/actions/reference/events-that-trigger-workflows
on:
  # Pushes trigger the workflow only on the main branch.
  push:
    branches: [main]
  # Pull requests trigger the workflow with no branch filter.
  pull_request: {}
# Runs are grouped by workflow + ref + PR head branch. A newer run in the same
# group cancels the in-progress one, except for runs on refs/heads/main.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
# Every `run:` step uses bash unless the step sets its own `shell:`.
defaults:
  run:
    shell: bash
# Workflow-wide environment: CPU-only PyTorch package indexes.
env:
  # CI: "true"
  # NOTE(review): TORCH_URL_RC is not referenced by any step in this workflow.
  TORCH_URL_RC: "https://download.pytorch.org/whl/test/cpu/torch_test.html"
  TORCH_URL_NIGHTLY: "https://download.pytorch.org/whl/nightly/cpu"
  TORCH_URL_STABLE: "https://download.pytorch.org/whl/cpu"
jobs:
  # Runs the test suites across the OS / Python / dependency-set matrix.
  pytester:
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: ["ubuntu-22.04", "macOS-14", "windows-2022"]
        python-version: ["3.10"]
        requires: ["latest", "nightly"] # , 'oldest'
        suite: ["core", "ops"]
        # The 3.11 / 3.12 entries do not set `suite`; of the test steps below,
        # only "Testing interpreter" matches them.
        include:
          - { os: "ubuntu-22.04", python-version: "3.11", requires: "latest" }
          - { os: "ubuntu-22.04", python-version: "3.12", requires: "latest" }
        exclude:
          - { os: "windows-2022", suite: "ops" }
    # Timeout: https://stackoverflow.com/a/59076067/4521646
    timeout-minutes: 35
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Setup Ubuntu
        # `runner.os` is Linux / Windows / macOS, never 'ubuntu'; comparing
        # against 'ubuntu' meant this step was always skipped.
        if: runner.os == 'Linux'
        run: |
          # Refresh the package index first so the install does not hit stale mirrors.
          sudo apt-get update
          sudo apt-get install -y graphviz

      - name: Set min. dependencies
        # 'oldest' is currently commented out of the matrix, so this step is
        # skipped until that entry is re-enabled.
        if: matrix.requires == 'oldest'
        run: |
          # Pin every `>=` requirement to exactly its lower bound.
          for fpath in ('requirements/base.txt', 'requirements/test.txt'):
              req = open(fpath).read().replace('>=', '==')
              open(fpath, 'w').write(req)
        shell: python

      - name: Get pip cache dir
        id: pip-cache
        run: echo "dir=$(pip cache dir)" >> "$GITHUB_OUTPUT"

      - name: Cache pip
        uses: actions/cache@v4
        with:
          path: ${{ steps.pip-cache.outputs.dir }}
          key: ${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}-pip-${{ hashFiles('requirements/*.txt') }}
          restore-keys: |
            ${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}-pip-

      - name: switch Torch source
        # Exports TORCH_URL (and, for nightly, PIP_EXTRA_FLAG) to later steps.
        run: |
          if [[ "${{ matrix.requires }}" == "nightly" ]]; then
            echo "TORCH_URL=$TORCH_URL_NIGHTLY" >> "$GITHUB_ENV"
            echo "PIP_EXTRA_FLAG=--pre" >> "$GITHUB_ENV"
          else
            echo "TORCH_URL=$TORCH_URL_STABLE" >> "$GITHUB_ENV"
          fi

      - name: Install package & dependencies
        run: |
          if [[ "${{ runner.os }}" == "macOS" ]]; then
            brew install libomp
          fi
          pip --version
          # PIP_EXTRA_FLAG is left unquoted on purpose: when it is unset it
          # must expand to nothing rather than to an empty argument.
          pip install -U \
            ${PIP_EXTRA_FLAG} torch torchvision torchaudio \
            --index-url=${TORCH_URL}
          pip install -e . -U \
            ${PIP_EXTRA_FLAG} -r requirements/test.txt
          pip list
        shell: bash

      - name: Testing Local
        if: matrix.python-version == '3.10' && matrix.suite == 'core'
        run: |
          coverage run --source thunder -m \
            pytest thunder/tests/ \
            --ignore=thunder/tests/distributed \
            --ignore=thunder/tests/test_ops.py \
            --ignore=thunder/tests/test_grad.py \
            -v --datefmt="%Y%m%d-%H:%M:%S.%f" \
            --random-order-seed=$GITHUB_RUN_ID \
            -n 4 --durations=250

      - name: Testing Distributed
        # run all found tests in given path as standalone
        if: matrix.python-version == '3.10' && runner.os == 'Linux' && matrix.suite == 'core'
        run: |
          pytest thunder/tests/distributed/ \
            -v --datefmt="%Y%m%d-%H:%M:%S.%f" \
            --random-order-seed=$GITHUB_RUN_ID \
            --durations=250

      - name: Testing Ops
        if: matrix.python-version == '3.10' && matrix.suite == 'ops' && runner.os != 'Windows'
        run: |
          coverage run --source thunder -m \
            pytest thunder/tests/test_ops.py thunder/tests/test_grad.py \
            -v --datefmt="%Y%m%d-%H:%M:%S.%f" \
            --random-order-seed=$GITHUB_RUN_ID \
            -n 4 --durations=250

      - name: Testing interpreter
        if: matrix.python-version == '3.11' || matrix.python-version == '3.12'
        #continue-on-error: true
        run: |
          python -m pytest \
            thunder/tests/test_interpreter.py \
            -v --durations=50 --cov=thunder
          # --cov-append keeps the coverage data written by the run above;
          # without it the second run starts from an erased data file.
          python -m pytest thunder/tests/test_jit_general.py -v --durations=50 --cov=thunder --cov-append

      - name: Statistics
        run: |
          coverage report
          coverage xml

      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v4
        # `env_vars` below names environment variables, so they are defined
        # here; scoping them to this step keeps them out of the test steps.
        env:
          OS: ${{ matrix.os }}
          PYTHON: ${{ matrix.python-version }}
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          file: ./coverage.xml
          flags: unittests
          env_vars: OS,PYTHON
          name: codecov-umbrella
          fail_ci_if_error: false

  # Single job that reflects the overall result of the `pytester` matrix.
  testing-guardian:
    runs-on: ubuntu-latest
    needs: pytester
    # Run even when `pytester` failed, was cancelled, or was skipped.
    if: always()
    steps:
      - run: echo "${{ needs.pytester.result }}"
      - name: failing...
        if: needs.pytester.result == 'failure'
        run: exit 1
      - name: cancelled or skipped...
        if: contains(fromJSON('["cancelled", "skipped"]'), needs.pytester.result)
        # The 90 s sleep outlasts the 1-minute timeout, so this step ends by
        # timing out rather than by completing.
        timeout-minutes: 1
        run: sleep 90