Skip to content

Commit

Permalink
fix: improve stalled retrieval cancellation (#1449)
Browse files Browse the repository at this point in the history
* refactor stalled retrieval cancel

* add ctx with timeout

* implement suggestions

* update err wrapping

* fix: set short cancel timeout for unpaid retrievals only

---------

Co-authored-by: Dirk McCormick <[email protected]>
  • Loading branch information
LexLuthr and dirkmc authored May 23, 2023
1 parent fd35f62 commit 24a9bb3
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 15 deletions.
2 changes: 1 addition & 1 deletion node/config/def.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ func DefaultBoost() *Boost {

DealProposalLogDuration: Duration(time.Hour * 24),
RetrievalLogDuration: Duration(time.Hour * 24),
StalledRetrievalTimeout: Duration(time.Minute * 30),
StalledRetrievalTimeout: Duration(time.Second * 30),

RetrievalPricing: &lotus_config.RetrievalPricing{
Strategy: RetrievalPricingDefaultMode,
Expand Down
41 changes: 28 additions & 13 deletions retrievalmarket/rtvllog/retrieval_log.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package rtvllog

import (
"context"
"errors"
"sync"
"time"

Expand Down Expand Up @@ -295,22 +296,36 @@ func (r *RetrievalLog) gcRetrievals(ctx context.Context) {
continue
}

var wg sync.WaitGroup
for _, row := range rows {
chid := datatransfer.ChannelID{Initiator: row.PeerID, Responder: row.LocalPeerID, ID: row.TransferID}
// Try to cancel via unpaid graphsync first
err := r.gsur.CancelTransfer(ctx, row.TransferID, &row.PeerID)

if err != nil {
// Attempt to terminate legacy, paid retrievals if we didnt cancel a free retrieval
err = r.dataTransfer.CloseDataTransferChannel(ctx, chid)
}

if err != nil {
log.Debugw("error canceling retrieval", "dealID", row.DealID, "err", err)
} else {
log.Infof("Canceled retrieval %s, older than %s", row.DealID, r.stalledTimeout)
if row.TransferID <= 0 {
continue
}
wg.Add(1)
go func(s RetrievalDealState) {
// Don't wait for more than 5 seconds for the cancel
// message to be sent when cancelling an unpaid retrieval
unpaidRtrvCtx, cancel := context.WithTimeout(ctx, time.Second*5)
defer cancel()
defer wg.Done()

// Try to cancel an unpaid retrieval with the given transfer id first
err := r.gsur.CancelTransfer(unpaidRtrvCtx, s.TransferID, &s.PeerID)
if err != nil && errors.Is(err, server.ErrRetrievalNotFound) {
// Couldn't find an unpaid retrieval with that id, try
// to cancel a legacy, paid retrieval
chid := datatransfer.ChannelID{Initiator: s.PeerID, Responder: s.LocalPeerID, ID: s.TransferID}
err = r.dataTransfer.CloseDataTransferChannel(ctx, chid)
}

if err != nil {
log.Debugw("error canceling retrieval", "dealID", s.DealID, "err", err)
} else {
log.Infof("Canceled retrieval %s, older than %s", s.DealID, r.stalledTimeout)
}
}(row)
}
wg.Wait()
}
}
}
Expand Down
3 changes: 2 additions & 1 deletion retrievalmarket/server/gsunpaidretrieval.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ import (
)

var log = logging.Logger("boostgs")
var ErrRetrievalNotFound = fmt.Errorf("no transfer found")

var incomingReqExtensions = []graphsync.ExtensionName{
extension.ExtensionIncomingRequest1_1,
Expand Down Expand Up @@ -175,7 +176,7 @@ func (g *GraphsyncUnpaidRetrieval) CancelTransfer(ctx context.Context, id datatr

if state == nil {
g.activeRetrievalsLk.Unlock()
return fmt.Errorf("no transfer with id %d", id)
return fmt.Errorf("failed to cancel with id %d: %w", id, ErrRetrievalNotFound)
}

rcpt := state.cs.recipient
Expand Down

0 comments on commit 24a9bb3

Please sign in to comment.