fix: Optimize sporadically failing CI.
Signed-off-by: tao.yang <[email protected]>
ty-dc committed Oct 22, 2024
1 parent 9fe62ba commit a7a4272
Showing 26 changed files with 1,816 additions and 153 deletions.
2 changes: 2 additions & 0 deletions go.mod
@@ -75,6 +75,7 @@ require github.com/google/go-cmp v0.6.0 // indirect
require k8s.io/component-base v0.29.4 // indirect

require (
github.com/hashicorp/go-multierror v1.1.1
go.uber.org/automaxprocs v1.5.3
k8s.io/kubectl v0.26.3
)
@@ -113,6 +114,7 @@ require (
github.com/google/uuid v1.3.0 // indirect
github.com/gorilla/handlers v1.5.1 // indirect
github.com/grafana/pyroscope-go/godeltaprof v0.1.3 // indirect
github.com/hashicorp/errwrap v1.1.0 // indirect
github.com/hashicorp/hcl v1.0.0 // indirect
github.com/huandu/xstrings v1.3.3 // indirect
github.com/imdario/mergo v0.3.13 // indirect
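go.mod promotes `github.com/hashicorp/go-multierror` to a direct dependency (pulling in `errwrap` transitively); it is what lets the diagnostic helpers added below keep executing after a command fails and then report every failure at once. A minimal sketch of the pattern, with an illustrative `runAll` helper that is not part of this commit:

```go
package main

import (
	"fmt"

	"github.com/hashicorp/go-multierror"
)

// runAll executes every step, collecting failures instead of
// stopping at the first one.
func runAll(steps map[string]func() error) error {
	var result *multierror.Error
	for name, step := range steps {
		if err := step(); err != nil {
			// Append accepts a nil *multierror.Error receiver.
			result = multierror.Append(result, fmt.Errorf("step %q: %w", name, err))
		}
	}
	// ErrorOrNil returns nil when nothing was appended.
	return result.ErrorOrNil()
}
```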
5 changes: 5 additions & 0 deletions go.sum
@@ -341,6 +341,11 @@ github.com/grafana/pyroscope-go v1.0.2/go.mod h1:bShDKsVZdzxq+Ol6no0JKigU9y5FTWU
github.com/grafana/pyroscope-go/godeltaprof v0.1.3 h1:eunWpv1B3Z7ZK9o4499EmQGlY+CsDmSZ4FbxjRx37uk=
github.com/grafana/pyroscope-go/godeltaprof v0.1.3/go.mod h1:1HSPtjU8vLG0jE9JrTdzjgFqdJ/VgN7fvxBNq3luJko=
github.com/gregjones/httpcache v0.0.0-20180305231024-9cad4c3443a7/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA=
github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4=
github.com/hashicorp/errwrap v1.1.0 h1:OxrOeh75EUXMY8TBjag2fzXGZ40LB6IKw45YeGUDY2I=
github.com/hashicorp/errwrap v1.1.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4=
github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo=
github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM=
github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8=
github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8=
github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4=
35 changes: 21 additions & 14 deletions test/doc/coordinator.md
@@ -1,18 +1,25 @@
# E2E Cases for coordinator

| Case ID | Title | Priority | Smoke | Status | Other |
| ------- | ------------------------------------------------------------ | -------- | ----- | ------ | ----- |
| C00001 | coordinator in tuneMode: underlay works well | p1 | smoke | done | |
| C00002 | coordinator in tuneMode: overlay works well | p1 | smoke | done | |
| C00003 | coordinator in tuneMode: underlay with two NIC | p1 | smoke | | |
| C00004 | coordinator in tuneMode: overlay with two NIC | p1 | smoke | | |
| C00005 | In overlay mode: specify the NIC (eth0) where the default route is located, use 'ip r get 8.8.8.8' to see if default route nic is the specify NIC | p2 | | done | |
| C00006 | In underlay mode: specify the NIC (eth0) where the default route is located, use 'ip r get 8.8.8.8' to see if default route nic is the specify NIC | p2 | | | |
| C00007 | ip conflict detection (ipv4, ipv6) | p2 | | done | |
| C00008 | override pod mac prefix | p2 | | done | |
| C00009 | gateway connection detection | p2 | | done | |
| C00010 | auto clean up the dirty rules(routing\neighborhood) while pod starting | p2 | | |
| C00011 | In the default scenario (Do not specify the NIC where the default route is located in any way) , use 'ip r get 8.8.8.8' to see if default route NIC is `net1` | p2 | | |
| C00012 | In multi-nic case , use 'ip r get <service_subnet> and <hostIP>' to see if src is from pod's eth0, note: only for ipv4. | p2 | | |
| Case ID | Title | Priority | Smoke | Status | Other |
|---------|---------------------------------------------------------------------------------------------------------------------------------------------------------------|----------|-------|--------|-------|
| C00001 | coordinator in tuneMode: underlay works well | p1 | smoke | done | |
| C00002 | coordinator in tuneMode: overlay works well | p1 | smoke | done | |
| C00003 | coordinator in tuneMode: underlay with two NIC | p1 | smoke | done | |
| C00004 | coordinator in tuneMode: overlay with two NIC | p1 | smoke | done | |
| C00005  | In overlay mode: specify the NIC (net1) where the default route is located, use 'ip r get 8.8.8.8' to check that the default route NIC is the specified NIC     | p2       |       | done   |       |
| C00006  | In underlay mode: specify the NIC (net1) where the default route is located, use 'ip r get 8.8.8.8' to check that the default route NIC is the specified NIC    | p2       |       | done   |       |
| C00007 | ip conflict detection (ipv4, ipv6) | p2 | | done | |
| C00008 | override pod mac prefix | p2 | | done | |
| C00009 | gateway connection detection | p2 | | done | |
| C00010  | auto clean up dirty rules (routing, neighbor) while the pod is starting                                                                                         | p2       |       | done   |       |
| C00011  | In the default scenario (the NIC where the default route is located is not specified in any way), use 'ip r get 8.8.8.8' to check that the default route NIC is `eth0` | p2 |    | done   |       |
| C00012  | In the multi-NIC case, use 'ip r get <service_subnet>' and 'ip r get <hostIP>' to check that src is the pod's eth0 IP; note: IPv4 only                          | p2       |       | done   |       |
| C00013  | Support `spec.externalTrafficPolicy` for a service in Local mode; it works well                                                                                 | p2       |       | done   |       |
| C00014  | Specify the NIC of the default route, but the NIC does not exist                                                                                                | p3       |       | done   |       |
| C00015  | In multi-NIC mode, pods are created normally even when NIC names are random                                                                                     | p3       |       | done   |       |
| C00016  | The routing table can be customized via hostRuleTable                                                                                                           | p3       |       | done   |       |
| C00017  | If TunePodRoutes is false, no routes are coordinated                                                                                                            | p3       |       | done   |       |
| C00018  | Conflicting IPs for stateless Pods should be released                                                                                                           | p3       |       | done   |       |
| C00019  | Conflicting IPs for stateful Pods should not be released                                                                                                        | p3       |       | done   |       |
| C00020  | kdoctor connectivity should succeed with annotation: ipam.spidernet.io/default-route-nic: net1                                                                  | p3       |       | done   |       |
| C00021  | kdoctor connectivity should succeed with three macvlan interfaces and rp_filter set to 1                                                                        | p3       |       | done   |       |
2 changes: 2 additions & 0 deletions test/e2e/common/constant.go
@@ -32,6 +32,8 @@ const (
BatchCreateTimeout = time.Minute * 5
KdoctorCheckTime = time.Minute * 10
SpiderSyncMultusTime = time.Minute * 2
InformerSyncStatusTime = time.Second * 30
KDoctorRunTimeout = time.Minute * 10
)

var ForcedWaitingTime = time.Second
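`InformerSyncStatusTime` bounds how long `CheckIppoolSanity` (in `test/e2e/common/spiderpool.go` below) waits for the IPPool informer to populate status fields, while `KDoctorRunTimeout`, per its name, caps kdoctor task runs. A minimal sketch of how such a constant typically drives a poll loop; `waitForSync` and `checkFn` are illustrative, not part of this commit, and assume the package's `context`, `fmt`, and `time` imports:

```go
// waitForSync polls checkFn until it reports done, returns an error,
// or the InformerSyncStatusTime budget is exhausted.
func waitForSync(checkFn func() (done bool, err error)) error {
	ctx, cancel := context.WithTimeout(context.Background(), InformerSyncStatusTime)
	defer cancel()
	for {
		select {
		case <-ctx.Done():
			return fmt.Errorf("timed out after %v waiting for informer sync", InformerSyncStatusTime)
		default:
			done, err := checkFn()
			if err != nil {
				return err
			}
			if done {
				return nil
			}
			time.Sleep(ForcedWaitingTime) // defined just above
		}
	}
}
```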
34 changes: 34 additions & 0 deletions test/e2e/common/node.go
@@ -8,6 +8,7 @@ import (
"fmt"
"os/exec"

"github.com/hashicorp/go-multierror"
. "github.com/onsi/ginkgo/v2"
e2e "github.com/spidernet-io/e2eframework/framework"
corev1 "k8s.io/api/core/v1"
@@ -55,3 +56,36 @@ func RestartNodeUntilClusterReady(ctx context.Context, frame *e2e.Framework, nod
GinkgoWriter.Println("Check that the status of all Pods in the cluster is running")
return nil
}

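// GetNodeNetworkInfo runs a fixed set of ip(8) diagnostics on each node via
// docker exec; per-command failures are aggregated into a single multierror
// instead of aborting the loop.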
func GetNodeNetworkInfo(ctx context.Context, frame *e2e.Framework, nodeList []string) error {
var jobResult *multierror.Error
for _, node := range nodeList {
GinkgoWriter.Printf("=============== Check the network information of the node %v ============== \n", node)
commands := []string{
"ip a",
"ip link show",
"ip n",
"ip -6 n",
"ip rule",
"ip -6 rule",
"ip route",
"ip route show table 100",
"ip route show table 101",
"ip route show table 500",
"ip -6 route",
"ip -6 route show table 100",
"ip -6 route show table 101",
"ip -6 route show table 500",
}

for _, command := range commands {
GinkgoWriter.Printf("--------------- execute %v in node: %v ------------ \n", command, node)
out, err := frame.DockerExecCommand(ctx, node, command)
if err != nil {
jobResult = multierror.Append(jobResult, fmt.Errorf("node %v: command '%v' failed with error: %w, output: %s", node, command, err, out))
}
}
}

return jobResult.ErrorOrNil()
}
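A hypothetical way to wire this helper into a suite: dump node state only when a spec fails. `AfterEach`, `CurrentSpecReport`, and `GinkgoWriter` are standard ginkgo/v2 APIs; `frame` and `nodeNames` are assumed to be initialized elsewhere in the suite:

```go
var _ = AfterEach(func() {
	// Dump node networking state only for failed specs, to keep logs small.
	if CurrentSpecReport().Failed() {
		ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
		defer cancel()
		if err := GetNodeNetworkInfo(ctx, frame, nodeNames); err != nil {
			GinkgoWriter.Printf("failed to dump node network info: %v \n", err)
		}
	}
})
```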
42 changes: 41 additions & 1 deletion test/e2e/common/pod.go
@@ -9,7 +9,9 @@ import (
"time"

"github.com/spidernet-io/spiderpool/pkg/constant"
"github.com/spidernet-io/spiderpool/pkg/utils/retry"

"github.com/hashicorp/go-multierror"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
e2e "github.com/spidernet-io/e2eframework/framework"
@@ -48,7 +50,14 @@ func GenerateExamplePodYaml(podName, namespace string) *corev1.Pod {
func CreatePodUntilReady(frame *e2e.Framework, podYaml *corev1.Pod, podName, namespace string, waitPodStartTimeout time.Duration) (pod *corev1.Pod, podIPv4, podIPv6 string) {
// create pod
GinkgoWriter.Printf("create pod %v/%v \n", namespace, podName)
err := frame.CreatePod(podYaml)
err := retry.RetryOnConflictWithContext(context.Background(), retry.DefaultBackoff, func(ctx context.Context) error {
err := frame.CreatePod(podYaml)
if err != nil {
GinkgoLogr.Error(fmt.Errorf("failed to create pod %v/%v, error: %v", namespace, podName, err), "Failed")
return err
}
return nil
})
Expect(err).NotTo(HaveOccurred(), "failed to create pod")

// wait for pod ip
@@ -138,3 +147,34 @@ func ValidatePodIPConflict(podList *corev1.PodList) error {
}
return nil
}

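// GetPodNetworkInfo runs the same ip(8) diagnostics inside each pod via exec;
// per-command failures are aggregated into a single multierror.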
func GetPodNetworkInfo(ctx context.Context, frame *e2e.Framework, podList *corev1.PodList) error {
var jobResult *multierror.Error
for _, pod := range podList.Items {
GinkgoWriter.Printf("=============== Check the network information of the pod %v/%v ============== \n", pod.Namespace, pod.Name)
commands := []string{
"ip a",
"ip link show",
"ip n",
"ip -6 n",
"ip rule",
"ip -6 rule",
"ip route",
"ip route show table 100",
"ip route show table 101",
"ip -6 route",
"ip -6 route show table 100",
"ip -6 route show table 101",
}

for _, command := range commands {
GinkgoWriter.Printf("--------------- execute %v in pod: %v/%v on node: %v ------------ \n", command, pod.Namespace, pod.Name, pod.Spec.NodeName)
out, err := frame.ExecCommandInPod(pod.Name, pod.Namespace, command, ctx)
if err != nil {
jobResult = multierror.Append(jobResult, fmt.Errorf("pod %v/%v: command '%v' failed with error: %w, output: %s", pod.Namespace, pod.Name, command, err, out))
}
}
}

return jobResult.ErrorOrNil()
}
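The `retry.RetryOnConflictWithContext` wrapper used in `CreatePodUntilReady` above comes from spiderpool's own `pkg/utils/retry` package, whose body is not part of this diff. A conflict-retry helper of that shape is conventionally built on `k8s.io/apimachinery`'s backoff utilities, roughly as sketched here; the `DefaultBackoff` values are assumptions, mirroring client-go's `retry.DefaultBackoff`:

```go
// Package retry: illustrative sketch of a conflict-retry helper, not the
// actual spiderpool implementation.
package retry

import (
	"context"
	"time"

	apierrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/util/wait"
)

// DefaultBackoff: assumed values, mirroring client-go's retry.DefaultBackoff.
var DefaultBackoff = wait.Backoff{Steps: 4, Duration: 10 * time.Millisecond, Factor: 5.0, Jitter: 0.1}

// RetryOnConflictWithContext retries fn with backoff for as long as it
// returns a Kubernetes conflict error, stopping early on ctx cancellation
// or any other error.
func RetryOnConflictWithContext(ctx context.Context, backoff wait.Backoff, fn func(context.Context) error) error {
	var lastErr error
	err := wait.ExponentialBackoffWithContext(ctx, backoff, func(ctx context.Context) (bool, error) {
		switch err := fn(ctx); {
		case err == nil:
			return true, nil // done
		case apierrors.IsConflict(err):
			lastErr = err
			return false, nil // retry
		default:
			return false, err // hard failure
		}
	})
	if err == wait.ErrWaitTimeout {
		err = lastErr // surface the last conflict rather than the generic timeout
	}
	return err
}
```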
77 changes: 53 additions & 24 deletions test/e2e/common/spiderpool.go
@@ -470,7 +470,7 @@ func DeleteIPPoolUntilFinish(f *frame.Framework, poolName string, ctx context.Co
default:
_, err := GetIppoolByName(f, poolName)
if err != nil {
GinkgoWriter.Printf("IPPool '%s' has been removederror: %v", poolName, err)
GinkgoWriter.Printf("IPPool '%s' has been removed, error: %v", poolName, err)
return nil
}
time.Sleep(ForcedWaitingTime)
@@ -562,7 +562,7 @@ func WaitWorkloadDeleteUntilFinish(ctx context.Context, f *frame.Framework, name
_, err := GetWorkloadByName(f, namespace, name)
if err != nil {
if api_errors.IsNotFound(err) {
GinkgoWriter.Printf("workload '%s/%s' has been removederror: %v", namespace, name, err)
GinkgoWriter.Printf("workload '%s/%s' has been removed, error: %v", namespace, name, err)
return nil
}
return err
@@ -877,14 +877,15 @@ func CheckIppoolSanity(f *frame.Framework, poolName string) error {
podYaml, err := f.GetPod(podName, podNS)
if err != nil {
if api_errors.IsNotFound(err) {
GinkgoLogr.Error(fmt.Errorf("pod %s/%s does not exist", podNS, podName), "Failed")
GinkgoLogr.Error(fmt.Errorf("the pod %s/%s in ippool %s, but pod does not exist in kubernetes", podNS, podName, poolName), "Failed")
isSanity = false
continue
} else {
return fmt.Errorf("failed to get pod %s/%s, error: %v", podNS, podName, err)
}
}

podNetworkIPs, err := ParsePodNetworkAnnotation(f, podYaml)
if nil != err {
if err != nil {
return fmt.Errorf("failed to parse pod %s/%s network annotation \n pod yaml %v, \n error: %v ", podNS, podName, podYaml, err)
}

@@ -917,9 +918,11 @@ func CheckIppoolSanity(f *frame.Framework, poolName string) error {
wep, err := GetWorkloadByName(f, podYaml.Namespace, podYaml.Name)
if err != nil {
if api_errors.IsNotFound(err) {
GinkgoLogr.Error(fmt.Errorf("endpoint %s/%s dose not exist", podYaml.Namespace, podYaml.Name), "Failed")
GinkgoLogr.Error(fmt.Errorf("pod %s/%s exists in ippool %s, but endpoint does not exist", podYaml.Namespace, podYaml.Name, poolName), "Failed")
isSanity = false
continue
}
return fmt.Errorf("failed to get endpoint %s/%s, error %v", podYaml.Namespace, podYaml.Name, err)
return fmt.Errorf("pod %s/%s exists in ippool %s, but failed to get endpoint, error %v", podYaml.Namespace, podYaml.Name, poolName, err)
}

podUsedIPs := convert.GroupIPAllocationDetails(wep.Status.Current.UID, wep.Status.Current.IPs)
@@ -941,24 +944,50 @@ func CheckIppoolSanity(f *frame.Framework, poolName string) error {
}
}

if *ippool.Status.AllocatedIPCount > *ippool.Status.TotalIPCount {
GinkgoWriter.Printf(
"allocated IP count (%v) exceeds total IP count (%v) \n",
*ippool.Status.AllocatedIPCount, *ippool.Status.TotalIPCount,
)
isSanity = false
}
// The IPPool status is synchronized automatically by the IPPool informer, based on the events it receives.
// In CI, IPPools are created and then health-checked almost immediately. If the spiderpool-controller is in
// the middle of a leader election, or the informer has not yet finished synchronizing, status fields such as
// TotalIPCount may still be nil, and dereferencing them would panic.
// In that case, wait for the informer to finish synchronizing the status before checking the IPPool's sanity.
ctx, cancel := context.WithTimeout(context.Background(), InformerSyncStatusTime)
defer cancel()
for {
select {
case <-ctx.Done():
return fmt.Errorf("waiting for informer to synchronize IPPool %s status timed out", poolName)
default:
if ippool.Status.AllocatedIPCount == nil || ippool.Status.TotalIPCount == nil {
GinkgoLogr.Error(fmt.Errorf("IPPool %s has nil status fields, retrying", poolName), "Failed")
ippool, err = GetIppoolByName(f, poolName)
if err != nil {
if api_errors.IsNotFound(err) {
return fmt.Errorf("ippool %s does not exist", poolName)
}
return fmt.Errorf("failed to get ippool %s, error %v", poolName, err)
}
time.Sleep(ForcedWaitingTime)
continue
}

// Ensure that the IP pool's reported usage matches the actual usage
if actualIPUsageCount != int(*ippool.Status.AllocatedIPCount) {
GinkgoWriter.Printf("IPPool %s usage count mismatch: expected %d, got %d \n", poolName, actualIPUsageCount, *ippool.Status.AllocatedIPCount)
isSanity = false
}
if *ippool.Status.AllocatedIPCount > *ippool.Status.TotalIPCount {
GinkgoWriter.Printf(
"allocated IP count (%v) exceeds total IP count (%v) \n",
*ippool.Status.AllocatedIPCount, *ippool.Status.TotalIPCount,
)
isSanity = false
}
// Ensure that the IP pool's reported usage matches the actual usage
if actualIPUsageCount != int(*ippool.Status.AllocatedIPCount) {
GinkgoWriter.Printf("IPPool %s usage count mismatch: expected %d, got %d \n", poolName, actualIPUsageCount, *ippool.Status.AllocatedIPCount)
isSanity = false
}

if !isSanity {
return fmt.Errorf("IPPool %s sanity check failed", poolName)
}
if !isSanity {
return fmt.Errorf("IPPool %s sanity check failed", poolName)
}

GinkgoWriter.Printf("Successfully checked IPPool %s sanity, IPPool record information is correct \n", poolName)
return nil
GinkgoWriter.Printf("Successfully checked IPPool %s sanity, IPPool record information is correct \n", poolName)
return nil
}
}
}
@@ -33,7 +33,6 @@ var (
request *kdoctorV1beta1.NetHttpRequest
condition *kdoctorV1beta1.NetSuccessCondition
schedule *kdoctorV1beta1.SchedulePlan
run = true
)

var _ = BeforeSuite(func() {