-
Notifications
You must be signed in to change notification settings - Fork 24
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat(restartStuckPod):Restart stuck pods #413
Changes from all commits
a49a7a2
8aaae51
ba39b77
350f618
eb36aaf
9540577
2f15ed8
c37db22
84f9882
c4eb216
d6e81d3
5b1d880
c9349d4
9425025
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,121 @@ | ||
package fullnode | ||
|
||
import ( | ||
"context" | ||
"fmt" | ||
"io" | ||
"strings" | ||
"time" | ||
|
||
cosmosv1 "github.com/strangelove-ventures/cosmos-operator/api/v1" | ||
"github.com/strangelove-ventures/cosmos-operator/internal/kube" | ||
corev1 "k8s.io/api/core/v1" | ||
"k8s.io/apimachinery/pkg/util/intstr" | ||
"k8s.io/client-go/kubernetes" | ||
"k8s.io/client-go/rest" | ||
"sigs.k8s.io/controller-runtime/pkg/client" | ||
) | ||
|
||
type StuckPodDetection struct { | ||
available func(pods []*corev1.Pod, minReady time.Duration, now time.Time) []*corev1.Pod | ||
collector StatusCollector | ||
computeRollout func(maxUnavail *intstr.IntOrString, desired, ready int) int | ||
} | ||
|
||
func NewStuckDetection(collector StatusCollector) StuckPodDetection { | ||
return StuckPodDetection{ | ||
available: kube.AvailablePods, | ||
collector: collector, | ||
computeRollout: kube.ComputeRollout, | ||
} | ||
} | ||
|
||
// StuckPods returns pods that are stuck on a block height due to a cometbft issue that manifests on sentries using horcrux. | ||
func (d StuckPodDetection) StuckPods(ctx context.Context, crd *cosmosv1.CosmosFullNode) []*corev1.Pod { | ||
pods := d.collector.Collect(ctx, client.ObjectKeyFromObject(crd)).Synced().Pods() | ||
|
||
for i, pod := range pods { | ||
config, err := rest.InClusterConfig() | ||
if err != nil { | ||
panic(err.Error()) | ||
} | ||
|
||
clientset, err := kubernetes.NewForConfig(config) | ||
if err != nil { | ||
panic(err.Error()) | ||
} | ||
|
||
receivedString := getPodLogsLastLine(clientset, pod) | ||
fmt.Println(receivedString) | ||
podIsStuck := isPodStuck(receivedString) | ||
|
||
//MORE TODO HERE | ||
if podIsStuck { | ||
pods = removeElement(pods, i) | ||
} | ||
} | ||
return pods | ||
} | ||
|
||
func isPodStuck(receivedString string) bool { | ||
if strings.Contains(receivedString, "SignerListener: Connected") { | ||
timeInLog, err := extractTimeFromLog(receivedString) | ||
if err != nil { | ||
fmt.Println("Error parsing time from log:", err) | ||
return true | ||
} | ||
|
||
currentTime := time.Now().UTC() | ||
|
||
logTimeToday := time.Date(currentTime.Year(), currentTime.Month(), currentTime.Day(), | ||
timeInLog.Hour(), timeInLog.Minute(), timeInLog.Second(), timeInLog.Nanosecond(), currentTime.Location()) | ||
|
||
timeDiff := currentTime.Sub(logTimeToday) | ||
|
||
if timeDiff >= time.Minute { | ||
return true | ||
} | ||
} | ||
|
||
return false | ||
} | ||
|
||
// extractTimeFromLog parses the leading whitespace-separated field of log as
// a 12-hour clock time such as "3:04PM".
//
// The returned time carries only the time of day; the layout has no date
// component. An error is returned when log is empty or contains only
// whitespace, or when the first field does not match the layout.
func extractTimeFromLog(log string) (time.Time, error) {
	// NOTE(review): a reviewer observed that other nodes (an Axelar testnet
	// sentry was cited) emit a different time string, so this may need to try
	// several layouts.
	const timeLayout = "3:04PM"

	parts := strings.Fields(log)
	if len(parts) == 0 {
		// Guard the parts[0] access below; an empty line has no timestamp.
		return time.Time{}, fmt.Errorf("log line is empty")
	}

	parsedTime, err := time.Parse(timeLayout, parts[0])
	if err != nil {
		return time.Time{}, fmt.Errorf("parsing log timestamp %q: %w", parts[0], err)
	}

	return parsedTime, nil
}
|
||
func getPodLogsLastLine(clientset *kubernetes.Clientset, pod *corev1.Pod) string { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We may also want to change this to return |
||
podLogOpts := corev1.PodLogOptions{} | ||
logRequest := clientset.CoreV1().Pods(pod.Namespace).GetLogs(pod.Name, &podLogOpts) | ||
|
||
logStream, err := logRequest.Stream(context.Background()) | ||
if err != nil { | ||
fmt.Printf("Error getting logs for pod %s: %v\n", pod.Name, err) | ||
return "" | ||
} | ||
defer logStream.Close() | ||
|
||
logBytes, err := io.ReadAll(logStream) | ||
if err != nil { | ||
fmt.Printf("Error reading logs for pod %s: %v\n", pod.Name, err) | ||
return "" | ||
} | ||
|
||
logLines := strings.Split(strings.TrimRight(string(logBytes), "\n"), "\n") | ||
if len(logLines) > 0 { | ||
return logLines[len(logLines)-1] | ||
} | ||
return "" | ||
} | ||
|
||
func removeElement(slice []*corev1.Pod, index int) []*corev1.Pod { | ||
return append(slice[:index], slice[index+1:]...) | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We may not want to return
true
here; if the time parse fails, this will return true from isPodStuck, which I assume would kill the pod inadvertently. Can we change this function to return
(bool, error)
so we can track the errors better?