From 61d7578c895ddbba4d51858dcd1d2abeb6ad3c3c Mon Sep 17 00:00:00 2001
From: Samuel Laferriere <samlaf92@gmail.com>
Date: Fri, 8 Nov 2024 01:17:40 +0400
Subject: [PATCH] test(altda): add test for altda->ethda failover

---
 op-alt-da/damock.go                    | 17 +++++-
 op-e2e/e2eutils/geth/wait.go           | 26 +++++++++
 op-e2e/e2eutils/transactions/count.go  | 18 ++++++-
 op-e2e/system/altda/concurrent_test.go |  2 +-
 op-e2e/system/altda/failover_test.go   | 74 ++++++++++++++++++++++++++
 op-e2e/system/da/multi_test.go         |  2 +-
 op-e2e/system/e2esys/setup.go          | 34 +++++++-----
 7 files changed, 155 insertions(+), 18 deletions(-)
 create mode 100644 op-e2e/system/altda/failover_test.go

diff --git a/op-alt-da/damock.go b/op-alt-da/damock.go
index ad388d0b26535..8d2f918968bc5 100644
--- a/op-alt-da/damock.go
+++ b/op-alt-da/damock.go
@@ -105,12 +105,16 @@ func (d *AltDADisabled) AdvanceL1Origin(ctx context.Context, l1 L1Fetcher, block
 }
 
 // FakeDAServer is a fake DA server for e2e tests.
-// It is a small wrapper around DAServer that allows for setting request latencies,
-// to mimic a DA service with slow responses (eg. eigenDA with 10 min batching interval).
+// It is a small wrapper around DAServer that allows for setting:
+//   - request latencies, to mimic a DA service with slow responses
+//     (eg. eigenDA with 10 min batching interval).
+//   - response status codes, to mimic a DA service that is down.
 type FakeDAServer struct {
 	*DAServer
 	putRequestLatency time.Duration
 	getRequestLatency time.Duration
+	// failoverCount is the number of upcoming Put requests that will get a 503 status code (for failover testing).
+	failoverCount uint64
 }
 
 func NewFakeDAServer(host string, port int, log log.Logger) *FakeDAServer {
@@ -130,6 +134,14 @@ func (s *FakeDAServer) HandleGet(w http.ResponseWriter, r *http.Request) {
 
+// HandlePut sleeps for the configured put latency, then either rejects the request with a
+// 503 (while failoverCount > 0, decrementing it) or forwards it to the wrapped DAServer.
 func (s *FakeDAServer) HandlePut(w http.ResponseWriter, r *http.Request) {
 	time.Sleep(s.putRequestLatency)
+	if s.failoverCount > 0 {
+		// Simulate a DA service that is down: answer 503 and do not process the request.
+		w.WriteHeader(http.StatusServiceUnavailable)
+		s.failoverCount--
+		return
+	}
 	s.DAServer.HandlePut(w, r)
 }
 
@@ -154,6 +162,11 @@ func (s *FakeDAServer) SetGetRequestLatency(latency time.Duration) {
 	s.getRequestLatency = latency
 }
 
+// SetPutFailoverForNRequests sets the next n Put requests to return 503 status code.
+func (s *FakeDAServer) SetPutFailoverForNRequests(n uint64) {
+	s.failoverCount = n
+}
+
 type MemStore struct {
 	db   map[string][]byte
 	lock sync.RWMutex
diff --git a/op-e2e/e2eutils/geth/wait.go b/op-e2e/e2eutils/geth/wait.go
index 8356058afda75..17c6f16226ca7 100644
--- a/op-e2e/e2eutils/geth/wait.go
+++ b/op-e2e/e2eutils/geth/wait.go
@@ -8,6 +8,7 @@ import (
 	"strings"
 	"time"
 
+	"github.com/ethereum-optimism/optimism/op-e2e/e2eutils/transactions"
 	"github.com/ethereum-optimism/optimism/op-node/rollup"
 	"github.com/ethereum-optimism/optimism/op-node/rollup/derive"
 	"github.com/ethereum/go-ethereum"
@@ -86,6 +87,31 @@ func WaitForTransaction(hash common.Hash, client *ethclient.Client, timeout time
 	}
 }
 
+// WaitForBlockWithTxFromSender returns the first block, among the nBlocks blocks starting at the
+// current head, that contains a transaction signed by sender; it errors if none of them does.
+func WaitForBlockWithTxFromSender(sender common.Address, client *ethclient.Client, nBlocks uint64) (*types.Block, error) {
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+	startBlockNum, err := client.BlockNumber(ctx)
+	if err != nil {
+		return nil, err
+	}
+	for blockNum := startBlockNum; blockNum < startBlockNum+nBlocks; blockNum++ {
+		blockL1, err := WaitForBlock(big.NewInt(0).SetUint64(blockNum), client)
+		if err != nil {
+			return nil, err
+		}
+		txCount, err := transactions.TransactionsBySenderCount(blockL1, sender)
+		if err != nil {
+			return nil, err
+		}
+		if txCount > 0 {
+			return blockL1, nil
+		}
+	}
+	return nil, fmt.Errorf("no block with tx from sender %s found in the last %d blocks", sender.Hex(), nBlocks)
+}
+
 type waitForBlockOptions struct {
 	noChangeTimeout time.Duration
 	absoluteTimeout time.Duration
diff --git a/op-e2e/e2eutils/transactions/count.go b/op-e2e/e2eutils/transactions/count.go
index 0f4d41fe04786..7f9f05c2857f5 100644
--- a/op-e2e/e2eutils/transactions/count.go
+++ b/op-e2e/e2eutils/transactions/count.go
@@ -5,7 +5,8 @@ import (
 	"github.com/ethereum/go-ethereum/core/types"
 )
 
-func TransactionsBySender(block *types.Block, sender common.Address) (int64, error) {
+// TransactionsBySenderCount returns the number of transactions in the block that were sent by the given sender.
+func TransactionsBySenderCount(block *types.Block, sender common.Address) (int64, error) {
 	txCount := int64(0)
 	for _, tx := range block.Transactions() {
 		signer := types.NewCancunSigner(tx.ChainId())
@@ -19,3 +20,18 @@ func TransactionsBySender(block *types.Block, sender common.Address) (int64, err
 	}
 	return txCount, nil
 }
+
+// TransactionsBySender returns the transactions in the block that were sent by the given sender.
+func TransactionsBySender(block *types.Block, sender common.Address) ([]*types.Transaction, error) {
+	matching := make([]*types.Transaction, 0)
+	for _, tx := range block.Transactions() {
+		from, err := types.Sender(types.NewCancunSigner(tx.ChainId()), tx)
+		if err != nil {
+			return nil, err
+		}
+		if from == sender {
+			matching = append(matching, tx)
+		}
+	}
+	return matching, nil
+}
diff --git a/op-e2e/system/altda/concurrent_test.go b/op-e2e/system/altda/concurrent_test.go
index 32506e5c4a102..cd6dcce911ede 100644
--- a/op-e2e/system/altda/concurrent_test.go
+++ b/op-e2e/system/altda/concurrent_test.go
@@ -63,7 +63,7 @@ func TestBatcherConcurrentAltDARequests(t *testing.T) {
 		require.NoError(t, err, "Waiting for l1 blocks")
 		// there are possibly other services (proposer/challenger) in the background sending txs
 		// so we only count the batcher txs
-		batcherTxCount, err := transactions.TransactionsBySender(block, cfg.DeployConfig.BatchSenderAddress)
+		batcherTxCount, err := transactions.TransactionsBySenderCount(block, cfg.DeployConfig.BatchSenderAddress)
 		require.NoError(t, err)
 		if batcherTxCount > 1 {
 			return
diff --git a/op-e2e/system/altda/failover_test.go b/op-e2e/system/altda/failover_test.go
new file mode 100644
index 0000000000000..9ca522c8345e6
--- /dev/null
+++ b/op-e2e/system/altda/failover_test.go
@@ -0,0 +1,74 @@
+package altda
+
+import (
+	"math/big"
+	"testing"
+
+	op_e2e "github.com/ethereum-optimism/optimism/op-e2e"
+	"github.com/ethereum-optimism/optimism/op-node/rollup/derive"
+	"github.com/ethereum/go-ethereum/log"
+
+	"github.com/ethereum-optimism/optimism/op-batcher/flags"
+	"github.com/ethereum-optimism/optimism/op-e2e/e2eutils/geth"
+	"github.com/ethereum-optimism/optimism/op-e2e/e2eutils/transactions"
+	"github.com/ethereum-optimism/optimism/op-e2e/system/e2esys"
+	"github.com/stretchr/testify/require"
+)
+
+// TestBatcher_FailoverToEthDA_FallbackToAltDA tests that the batcher will failover to ethDA
+// if the da-server returns 503, and then fallback to altDA once altDA is available again
+// (i.e. the da-server doesn't return 503 anymore).
+func TestBatcher_FailoverToEthDA_FallbackToAltDA(t *testing.T) {
+	op_e2e.InitParallel(t)
+
+	nChannelsFailover := uint64(2)
+
+	cfg := e2esys.DefaultSystemConfig(t, e2esys.WithLogLevel(log.LevelCrit))
+	cfg.DeployConfig.UseAltDA = true
+	// With these settings, the batcher will post a single commitment per L1 block,
+	// so it's easy to trigger failover and observe the commitment changing on the next L1 block.
+	cfg.BatcherMaxPendingTransactions = 1 // at most one batcher tx pending at a time
+	cfg.BatcherMaxConcurrentDARequest = 1
+	cfg.BatcherBatchType = 0
+	// We make channels as small as possible, such that they contain a single commitment.
+	// This is because failover to ethDA happens on a per-channel basis (each new channel is sent to altDA first).
+	// Hence, we can quickly observe the failover (to ethda) and fallback (to altda) behavior.
+	// cfg.BatcherMaxL1TxSizeBytes = 1200
+	// currently altda commitments can only be sent as calldata
+	cfg.DataAvailabilityType = flags.CalldataType
+
+	sys, err := cfg.Start(t)
+	require.NoError(t, err, "Error starting up system")
+	defer sys.Close()
+	l1Client := sys.NodeClient("l1")
+
+	startBlockL1, err := geth.WaitForBlockWithTxFromSender(cfg.DeployConfig.BatchSenderAddress, l1Client, 10)
+	require.NoError(t, err)
+
+	// Simulate altda server returning 503
+	sys.FakeAltDAServer.SetPutFailoverForNRequests(nChannelsFailover)
+
+	countEthDACommitment := uint64(0)
+
+	// Most likely, sequence of blocks will be: altDA, ethDA, ethDA, altDA, altDA, altDA.
+	for blockNumL1 := startBlockL1.NumberU64(); blockNumL1 < startBlockL1.NumberU64()+6; blockNumL1++ {
+		blockL1, err := geth.WaitForBlock(big.NewInt(0).SetUint64(blockNumL1), l1Client)
+		require.NoError(t, err)
+		batcherTxs, err := transactions.TransactionsBySender(blockL1, cfg.DeployConfig.BatchSenderAddress)
+		require.NoError(t, err)
+		require.Len(t, batcherTxs, 1) // sanity check: ensure BatcherMaxPendingTransactions=1 is working
+		txData := batcherTxs[0].Data()
+		require.NotEmpty(t, txData, "batcher tx has no calldata")
+		// The first calldata byte is the version byte that distinguishes altDA from ethDA txs.
+		switch txData[0] {
+		case 1:
+			t.Log("blockL1", blockNumL1, "batcherTxType", "altda")
+		case byte(derive.DerivationVersion0):
+			t.Log("blockL1", blockNumL1, "batcherTxType", "ethda")
+			countEthDACommitment++
+		default:
+			t.Fatalf("unexpected batcherTxType: %v", txData[0])
+		}
+	}
+	require.Equal(t, nChannelsFailover, countEthDACommitment, "Expected %v ethDA commitments, got %v", nChannelsFailover, countEthDACommitment)
+}
diff --git a/op-e2e/system/da/multi_test.go b/op-e2e/system/da/multi_test.go
index 3d150010a0f38..da503f33b16b3 100644
--- a/op-e2e/system/da/multi_test.go
+++ b/op-e2e/system/da/multi_test.go
@@ -51,7 +51,7 @@ func TestBatcherMultiTx(t *testing.T) {
 		require.NoError(t, err, "Waiting for l1 blocks")
 		// there are possibly other services (proposer/challenger) in the background sending txs
 		// so we only count the batcher txs
-		batcherTxCount, err := transactions.TransactionsBySender(block, cfg.DeployConfig.BatchSenderAddress)
+		batcherTxCount, err := transactions.TransactionsBySenderCount(block, cfg.DeployConfig.BatchSenderAddress)
 		require.NoError(t, err)
 		totalBatcherTxsCount += int64(batcherTxCount)
 
diff --git a/op-e2e/system/e2esys/setup.go b/op-e2e/system/e2esys/setup.go
index 5d046b3c649c3..1f368cb49b29f 100644
--- a/op-e2e/system/e2esys/setup.go
+++ b/op-e2e/system/e2esys/setup.go
@@ -6,6 +6,7 @@ import (
 	"crypto/rand"
 	"errors"
 	"fmt"
+	"log/slog"
 	"math/big"
 	"net"
 	"os"
@@ -85,6 +86,7 @@ var (
 
 type SystemConfigOpts struct {
 	AllocType config.AllocType
+	LogLevel  slog.Level // level for the loggers created by DefaultSystemConfig
 }
 
 type SystemConfigOpt func(s *SystemConfigOpts)
@@ -95,9 +97,16 @@ func WithAllocType(allocType config.AllocType) SystemConfigOpt {
 	}
 }
 
+// WithLogLevel returns an option that sets the log level used for the loggers
+// created by DefaultSystemConfig (which defaults to slog.LevelInfo).
+func WithLogLevel(level slog.Level) SystemConfigOpt {
+	return func(s *SystemConfigOpts) { s.LogLevel = level }
+}
+
 func DefaultSystemConfig(t testing.TB, opts ...SystemConfigOpt) SystemConfig {
 	sco := &SystemConfigOpts{
 		AllocType: config.DefaultAllocType,
+		LogLevel:  slog.LevelInfo,
 	}
 	for _, opt := range opts {
 		opt(sco)
@@ -108,7 +117,7 @@ func DefaultSystemConfig(t testing.TB, opts ...SystemConfigOpt) SystemConfig {
 	deployConfig := config.DeployConfig(sco.AllocType)
 	deployConfig.L1GenesisBlockTimestamp = hexutil.Uint64(time.Now().Unix())
 	e2eutils.ApplyDeployConfigForks(deployConfig)
-	require.NoError(t, deployConfig.Check(testlog.Logger(t, log.LevelInfo)),
+	require.NoError(t, deployConfig.Check(testlog.Logger(t, sco.LogLevel).New("role", "config-check")),
 		"Deploy config is invalid, do you need to run make devnet-allocs?")
 	l1Deployments := config.L1Deployments(sco.AllocType)
 	require.NoError(t, l1Deployments.Check(deployConfig))
@@ -170,11 +179,12 @@ func DefaultSystemConfig(t testing.TB, opts ...SystemConfigOpt) SystemConfig {
 			},
 		},
 		Loggers: map[string]log.Logger{
-			RoleVerif:   testlog.Logger(t, log.LevelInfo).New("role", RoleVerif),
-			RoleSeq:     testlog.Logger(t, log.LevelInfo).New("role", RoleSeq),
-			"batcher":   testlog.Logger(t, log.LevelInfo).New("role", "batcher"),
-			"proposer":  testlog.Logger(t, log.LevelInfo).New("role", "proposer"),
-			"da-server": testlog.Logger(t, log.LevelInfo).New("role", "da-server"),
+			RoleVerif:      testlog.Logger(t, sco.LogLevel).New("role", RoleVerif),
+			RoleSeq:        testlog.Logger(t, sco.LogLevel).New("role", RoleSeq),
+			"batcher":      testlog.Logger(t, sco.LogLevel).New("role", "batcher"),
+			"proposer":     testlog.Logger(t, sco.LogLevel).New("role", "proposer"),
+			"da-server":    testlog.Logger(t, sco.LogLevel).New("role", "da-server"),
+			"config-check": testlog.Logger(t, sco.LogLevel).New("role", "config-check"),
 		},
 		GethOptions:                   map[string][]geth.GethOption{},
 		P2PTopology:                   nil, // no P2P connectivity by default
@@ -265,12 +275,10 @@ type SystemConfig struct {
 	// L1FinalizedDistance is the distance from the L1 head that L1 blocks will be artificially finalized on.
 	L1FinalizedDistance uint64
 
-	Premine        map[common.Address]*big.Int
-	Nodes          map[string]*rollupNode.Config // Per node config. Don't use populate rollup.Config
-	Loggers        map[string]log.Logger
-	GethOptions    map[string][]geth.GethOption
-	ProposerLogger log.Logger
-	BatcherLogger  log.Logger
+	Premine     map[common.Address]*big.Int
+	Nodes       map[string]*rollupNode.Config // Per node config. Don't use populate rollup.Config
+	Loggers     map[string]log.Logger
+	GethOptions map[string][]geth.GethOption
 
 	ExternalL2Shim string
 
@@ -519,7 +527,7 @@ func (cfg SystemConfig) Start(t *testing.T, startOpts ...StartOption) (*System,
 		c = sys.TimeTravelClock
 	}
 
-	if err := cfg.DeployConfig.Check(testlog.Logger(t, log.LevelInfo)); err != nil {
+	if err := cfg.DeployConfig.Check(cfg.Loggers["config-check"]); err != nil {
 		return nil, err
 	}