ci: config vpn, add harvester terraform script, and implement harvester vm reboot operation

Signed-off-by: Yang Chiu <[email protected]>
yangchiu committed May 31, 2024
1 parent ef6d01c commit 58759d2
Showing 16 changed files with 404 additions and 36 deletions.
3 changes: 2 additions & 1 deletion e2e/libs/host/__init__.py
@@ -1 +1,2 @@
from host.host import Host
from host.aws import Aws
from host.harvester import Harvester
15 changes: 3 additions & 12 deletions e2e/libs/host/host.py → e2e/libs/host/aws.py
@@ -1,24 +1,16 @@
import boto3
import time
import yaml

from host.constant import NODE_REBOOT_DOWN_TIME_SECOND

from node.node import Node

from utility.utility import logging
from utility.utility import wait_for_cluster_ready
from host.base import Base


class Host:
class Aws(Base):

def __init__(self):
with open('/tmp/instance_mapping', 'r') as f:
self.mapping = yaml.safe_load(f)
super().__init__()
self.aws_client = boto3.client('ec2')

self.node = Node()

def reboot_all_nodes(self, shut_down_time_in_sec=NODE_REBOOT_DOWN_TIME_SECOND):
instance_ids = [value for value in self.mapping.values()]

@@ -93,4 +85,3 @@ def power_on_node(self, power_on_node_name):
waiter = self.aws_client.get_waiter('instance_running')
waiter.wait(InstanceIds=instance_ids)
logging(f"Started instances")

32 changes: 32 additions & 0 deletions e2e/libs/host/base.py
@@ -0,0 +1,32 @@
import yaml
from abc import ABC, abstractmethod
from node.node import Node


class Base(ABC):

def __init__(self):
with open('/tmp/instance_mapping', 'r') as f:
self.mapping = yaml.safe_load(f)
self.node = Node()

@abstractmethod
def reboot_all_nodes(self, shut_down_time_in_sec):
return NotImplemented

@abstractmethod
def reboot_node(self, node_name, shut_down_time_in_sec):
return NotImplemented

@abstractmethod
def reboot_all_worker_nodes(self, shut_down_time_in_sec):
return NotImplemented

@abstractmethod
def power_off_node(self, node_name):
return NotImplemented

@abstractmethod
def power_on_node(self, node_name):
return NotImplemented
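
Illustrative sketch (not part of this commit): a new host provider only needs to subclass Base and implement the five abstract operations. The node-name-to-instance mapping loaded from /tmp/instance_mapping is available as self.mapping, and a Node helper as self.node. The class name below is hypothetical.

import time

from host.base import Base
from utility.utility import logging


class DummyProvider(Base):  # hypothetical provider, for illustration only

    def reboot_all_nodes(self, shut_down_time_in_sec=60):
        for node_name in self.mapping:
            self.reboot_node(node_name, shut_down_time_in_sec)

    def reboot_node(self, node_name, shut_down_time_in_sec):
        self.power_off_node(node_name)
        time.sleep(shut_down_time_in_sec)
        self.power_on_node(node_name)

    def reboot_all_worker_nodes(self, shut_down_time_in_sec):
        for node_name in self.node.list_node_names_by_role("worker"):
            self.reboot_node(node_name, shut_down_time_in_sec)

    def power_off_node(self, node_name):
        logging(f"Would power off instance {self.mapping[node_name]}")

    def power_on_node(self, node_name):
        logging(f"Would power on instance {self.mapping[node_name]}")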

92 changes: 92 additions & 0 deletions e2e/libs/host/harvester.py
@@ -0,0 +1,92 @@
import requests
import os
import time
from host.constant import NODE_REBOOT_DOWN_TIME_SECOND
from utility.utility import logging
from utility.utility import wait_for_cluster_ready
from utility.utility import get_retry_count_and_interval
from host.base import Base
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

class Harvester(Base):

def __init__(self):
super().__init__()
self.url = f"{os.getenv('LAB_URL')}/k8s/clusters/{os.getenv('LAB_CLUSTER_ID')}/v1/harvester/kubevirt.io.virtualmachines/longhorn-qa"
self.cookies = {
'R_SESS': f"{os.getenv('LAB_ACCESS_KEY')}:{os.getenv('LAB_SECRET_KEY')}"
}
self.retry_count, self.retry_interval = get_retry_count_and_interval()

def reboot_all_nodes(self, shut_down_time_in_sec=NODE_REBOOT_DOWN_TIME_SECOND):
node_names = [key for key in self.mapping.keys()]

for node_name in node_names:
self.power_off_node(node_name)

logging(f"Wait for {shut_down_time_in_sec} seconds before starting vms")
time.sleep(shut_down_time_in_sec)

for node_name in node_names:
self.power_on_node(node_name)

wait_for_cluster_ready()

def reboot_node(self, node_name, shut_down_time_in_sec):
self.power_off_node(node_name)

logging(f"Wait for {shut_down_time_in_sec} seconds before starting vms")
time.sleep(shut_down_time_in_sec)

self.power_on_node(node_name)

def reboot_all_worker_nodes(self, shut_down_time_in_sec):
node_names = self.node.list_node_names_by_role("worker")

for node_name in node_names:
self.power_off_node(node_name)

logging(f"Wait for {shut_down_time_in_sec} seconds before starting vms")
time.sleep(shut_down_time_in_sec)

for node_name in node_names:
self.power_on_node(node_name)

def power_off_node(self, node_name):
vm_id = self.mapping[node_name]

url = f"{self.url}/{vm_id}"
resp = requests.post(f"{url}?action=stop", cookies=self.cookies, verify=False)
logging(f"resp = {resp}")
assert resp.status_code == 204, f"Failed to stop vm {vm_id} response: {resp.status_code} {resp.reason}, request: {resp.request.url} {resp.request.headers}"
logging(f"Stopping vm {vm_id}")

stopped = False
for i in range(self.retry_count):
logging(f"Waiting for vm {vm_id} stopped ... ({i})")
resp = requests.get(url, cookies=self.cookies, verify=False)
if "Stopped" in resp.json()['metadata']['fields']:
stopped = True
break
time.sleep(self.retry_interval)
assert stopped, f"Expected vm {vm_id} to be stopped but it's not"

def power_on_node(self, node_name):
vm_id = self.mapping[node_name]

url = f"{self.url}/{vm_id}"
resp = requests.post(f"{url}?action=start", cookies=self.cookies, verify=False)
logging(f"resp = {resp}")
assert resp.status_code == 204, f"Failed to start vm {vm_id} response: {resp.status_code} {resp.reason}, request: {resp.request.url} {resp.request.headers}"
logging(f"Starting vm {vm_id}")

started = False
for i in range(self.retry_count):
logging(f"Waiting for vm {vm_id} started ... ({i})")
resp = requests.get(url, cookies=self.cookies, verify=False)
if "Running" in resp.json()['metadata']['fields']:
started = True
break
time.sleep(self.retry_interval)
assert started, f"Expected vm {vm_id} to be started but it's not"
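
Usage sketch (illustration only; the node name is hypothetical): with LAB_URL, LAB_CLUSTER_ID, LAB_ACCESS_KEY and LAB_SECRET_KEY exported and /tmp/instance_mapping in place, the new class power-cycles a node the same way the Aws host does.

from host import Harvester
from host.constant import NODE_REBOOT_DOWN_TIME_SECOND

host = Harvester()
# stop the backing VM, wait, then start it again and poll until it reports Running
host.reboot_node("longhorn-qa-worker-0", shut_down_time_in_sec=NODE_REBOOT_DOWN_TIME_SECOND)
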
12 changes: 9 additions & 3 deletions e2e/libs/keywords/host_keywords.py
@@ -1,6 +1,7 @@
import os
from robot.libraries.BuiltIn import BuiltIn

from host import Host
from host import Harvester, Aws
from host.constant import NODE_REBOOT_DOWN_TIME_SECOND

from node import Node
@@ -12,8 +13,13 @@ class host_keywords:

def __init__(self):
self.volume_keywords = BuiltIn().get_library_instance('volume_keywords')

self.host = Host()
host_provider = os.getenv('HOST_PROVIDER')
if host_provider == "aws":
self.host = Aws()
elif host_provider == "harvester":
self.host = Harvester()
else:
raise Exception(f"Unsupported host provider {host_provider}")
self.node = Node()

def reboot_node_by_index(self, idx, power_off_time_in_min=1):
9 changes: 5 additions & 4 deletions e2e/libs/longhorn.py
@@ -293,7 +293,7 @@ def _get_raw(self, url, data=None):

def _get_response(self, url, data=None):
r = self._session.get(url, auth=self._auth, params=data,
headers=self._headers)
headers=self._headers, timeout=DEFAULT_TIMEOUT)
if r.status_code < 200 or r.status_code >= 300:
self._error(r.text, r.status_code)

@@ -302,7 +302,7 @@ def _get_response(self, url, data=None):
@timed_url
def _post(self, url, data=None):
r = self._session.post(url, auth=self._auth, data=self._marshall(data),
headers=self._headers)
headers=self._headers, timeout=DEFAULT_TIMEOUT)
if r.status_code < 200 or r.status_code >= 300:
self._error(r.text, r.status_code)

@@ -311,15 +311,16 @@ def _post(self, url, data=None):
@timed_url
def _put(self, url, data=None):
r = self._session.put(url, auth=self._auth, data=self._marshall(data),
headers=self._headers)
headers=self._headers, timeout=DEFAULT_TIMEOUT)
if r.status_code < 200 or r.status_code >= 300:
self._error(r.text, r.status_code)

return self._unmarshall(r.text)

@timed_url
def _delete(self, url):
r = self._session.delete(url, auth=self._auth, headers=self._headers)
r = self._session.delete(url, auth=self._auth, headers=self._headers,
timeout=DEFAULT_TIMEOUT)
if r.status_code < 200 or r.status_code >= 300:
self._error(r.text, r.status_code)

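The GET, POST, PUT and DELETE calls above now pass an explicit timeout so a stalled Longhorn API endpoint fails the request instead of hanging the run. DEFAULT_TIMEOUT itself is not shown in this hunk; a minimal sketch of the pattern follows (the constant value and URL are illustrative only, not taken from longhorn.py).

import requests

DEFAULT_TIMEOUT = 60  # seconds; illustrative value, the real constant lives elsewhere in longhorn.py

session = requests.Session()
try:
    # raises requests.exceptions.Timeout instead of blocking forever
    resp = session.get("http://longhorn-frontend/v1/volumes", timeout=DEFAULT_TIMEOUT)
except requests.exceptions.Timeout:
    resp = None  # caller can retry or fail fast
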
13 changes: 13 additions & 0 deletions e2e/libs/utility/utility.py
@@ -168,6 +168,19 @@ def filter_cr(group, version, namespace, plural, field_selector="", label_select
except ApiException as e:
logging(f"Listing namespaced custom object: {e}")

def list_namespaced_pod(namespace, label_selector=""):
api = client.CoreV1Api()
retry_count, retry_interval = get_retry_count_and_interval()
for i in range(retry_count):
try:
resp = api.list_namespaced_pod(
namespace=namespace,
label_selector=label_selector)
return resp.items
except Exception as e:
logging(f"Failed to list namespaced {namespace} pods with error: {e}")
time.sleep(retry_interval)
assert False, f"Failed to list namespaced {namespace} pods"

def set_annotation(group, version, namespace, plural, name, annotation_key, annotation_value):
api = client.CustomObjectsApi()
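Usage sketch (the namespace and label selector are illustrative, and the kubernetes client is assumed to be already configured as it is in the test framework): the new helper wraps pod listing in the standard retry loop, so callers get either a list of V1Pod objects or an assertion failure once the retries are exhausted.

from utility.utility import list_namespaced_pod

pods = list_namespaced_pod("longhorn-system", label_selector="app=longhorn-manager")
for pod in pods:
    print(pod.metadata.name, pod.status.phase)
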
9 changes: 4 additions & 5 deletions e2e/libs/workload/workload.py
@@ -5,7 +5,7 @@

from utility.utility import get_retry_count_and_interval
from utility.utility import logging

from utility.utility import list_namespaced_pod
from workload.pod import is_pod_terminated_by_kubelet
from workload.constant import WAIT_FOR_POD_STABLE_MAX_RETRY
from workload.constant import WAIT_FOR_POD_KEPT_IN_STATE_TIME
@@ -20,18 +20,17 @@ def get_workload_pod_names(workload_name):


def get_workload_pods(workload_name, namespace="default"):
api = client.CoreV1Api()
label_selector = f"app={workload_name}"
resp = api.list_namespaced_pod(
pods = list_namespaced_pod(
namespace=namespace,
label_selector=label_selector)

if not resp.items:
if len(pods) == 0:
logging(f"No pods found for workload {workload_name} in namespace {namespace}")
return []

filtered_pods = []
for pod in resp.items:
for pod in pods:
# https://github.com/longhorn/longhorn/issues/8550#issuecomment-2109276522
if is_pod_terminated_by_kubelet(pod):
logging(f"Skipping pod {pod.metadata.name} because it is terminated by kubelet")
2 changes: 1 addition & 1 deletion pipelines/e2e/Dockerfile.setup
@@ -25,7 +25,7 @@ RUN wget -q https://storage.googleapis.com/kubernetes-release/release/$KUBECTL_V
wget -q "https://github.com/mikefarah/yq/releases/download/${YQ_VERSION}/yq_linux_amd64" && \
mv yq_linux_amd64 /usr/local/bin/yq && \
chmod +x /usr/local/bin/yq && \
apk add openssl openssh-client ca-certificates git rsync bash curl jq python3 py3-pip gcc python3-dev libc-dev py3-virtualenv docker && \
apk add openssl openssh-client ca-certificates git rsync bash curl jq python3 py3-pip gcc python3-dev libc-dev py3-virtualenv docker openvpn moreutils && \
ssh-keygen -t rsa -b 4096 -N "" -f ~/.ssh/id_rsa && \
curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 && \
chmod 700 get_helm.sh && \
13 changes: 12 additions & 1 deletion pipelines/e2e/Jenkinsfile
@@ -28,6 +28,10 @@ node {
usernamePassword(credentialsId: CREDS_ID, passwordVariable: 'AWS_SECRET_KEY', usernameVariable: 'AWS_ACCESS_KEY'),
string(credentialsId: 'DO_CREDS', variable: 'DO_TOKEN'),
string(credentialsId: REGISTRATION_CODE_ID, variable: 'REGISTRATION_CODE'),
file(credentialsId: 'vpn.ovpn', variable: 'VPN_CONFIG'),
file(credentialsId: 'login.conf', variable: 'LOGIN_CONFIG'),
usernamePassword(credentialsId: 'LAB_API_KEY', passwordVariable: 'LAB_SECRET_KEY', usernameVariable: 'LAB_ACCESS_KEY'),
string(credentialsId: 'LAB_URL', variable: 'LAB_URL'),
]) {

if (params.SEND_SLACK_NOTIFICATION) {
@@ -76,7 +80,8 @@
echo "Using registration code: $REGISTRATION_CODE_ID"

sh "pipelines/e2e/scripts/build.sh"
sh """ docker run -itd --name ${JOB_BASE_NAME}-${BUILD_NUMBER} \
sh """ docker run -itd --cap-add=NET_ADMIN \
--name ${JOB_BASE_NAME}-${BUILD_NUMBER} \
--env AIR_GAP_INSTALLATION=${AIR_GAP_INSTALLATION} \
--env REGISTRY_URL=${REGISTRY_URL} \
--env REGISTRY_USERNAME=${REGISTRY_USERNAME} \
@@ -118,11 +123,17 @@
--env TF_VAR_cis_hardening=${CIS_HARDENING} \
--env TF_VAR_resources_owner=longhorn-long-running \
--env TF_VAR_extra_block_device=${RUN_V2_TEST} \
--env TF_VAR_lab_url=${LAB_URL} \
--env TF_VAR_lab_access_key=${LAB_ACCESS_KEY} \
--env TF_VAR_lab_secret_key=${LAB_SECRET_KEY} \
--env IMAGE_NAME=${imageName} \
-v /var/run/docker.sock:/var/run/docker.sock \
--mount source="vol-${imageName}",target=/tmp \
${imageName}
"""

sh "docker cp ${VPN_CONFIG} ${JOB_BASE_NAME}-${BUILD_NUMBER}:/src/longhorn-tests/vpn.ovpn"
sh "docker cp ${LOGIN_CONFIG} ${JOB_BASE_NAME}-${BUILD_NUMBER}:/src/longhorn-tests/login.conf"
}

timeout(60) {
Expand Down
4 changes: 3 additions & 1 deletion pipelines/utilities/kubeconfig.sh
@@ -1,7 +1,9 @@
set_kubeconfig(){
# rke2, rke and k3s all support amd64
# but only k3s supports arm64
if [[ "${TF_VAR_arch}" == "amd64" ]] ; then
if [[ "${LONGHORN_TEST_CLOUDPROVIDER}" == "harvester" ]]; then
export KUBECONFIG="${PWD}/test_framework/kube_config.yaml"
elif [[ "${TF_VAR_arch}" == "amd64" ]]; then
if [[ "${TF_VAR_k8s_distro_name}" == "rke" ]]; then
export KUBECONFIG="${PWD}/test_framework/kube_config_rke.yml"
elif [[ "${TF_VAR_k8s_distro_name}" == "rke2" ]]; then
6 changes: 6 additions & 0 deletions pipelines/utilities/run_longhorn_e2e_test.sh
@@ -80,6 +80,7 @@ run_longhorn_e2e_test_out_of_cluster(){
cp "${KUBECONFIG}" /tmp/kubeconfig
CONTAINER_NAME="e2e-container-${IMAGE_NAME}"
docker run --pull=always \
--network=container:"${IMAGE_NAME}" \
--name "${CONTAINER_NAME}" \
-e LONGHORN_BACKUPSTORE="${LONGHORN_BACKUPSTORES}" \
-e LONGHORN_BACKUPSTORE_POLL_INTERVAL="${LONGHORN_BACKUPSTORE_POLL_INTERVAL}" \
@@ -88,6 +89,11 @@
-e AWS_DEFAULT_REGION="${TF_VAR_aws_region}" \
-e LONGHORN_CLIENT_URL="${LONGHORN_CLIENT_URL}" \
-e KUBECONFIG="/tmp/kubeconfig" \
-e HOST_PROVIDER="${LONGHORN_TEST_CLOUDPROVIDER}" \
-e LAB_URL="${TF_VAR_lab_url}" \
-e LAB_ACCESS_KEY="${TF_VAR_lab_access_key}" \
-e LAB_SECRET_KEY="${TF_VAR_lab_secret_key}" \
-e LAB_CLUSTER_ID="$(cat /tmp/cluster_id)" \
--mount source="vol-${IMAGE_NAME}",target=/tmp \
"${LONGHORN_TESTS_CUSTOM_IMAGE}" "${ROBOT_COMMAND_ARGS[@]}"
docker stop "${CONTAINER_NAME}"
27 changes: 19 additions & 8 deletions pipelines/utilities/terraform_setup.sh
@@ -2,6 +2,11 @@

set -x

if [[ ${LONGHORN_TEST_CLOUDPROVIDER} == "harvester" ]]; then
source pipelines/utilities/vpn.sh
connect_to_vpn
fi

if [[ ${TF_VAR_arch} == "amd64" ]]; then
terraform -chdir=test_framework/terraform/${LONGHORN_TEST_CLOUDPROVIDER}/${DISTRO} init
terraform -chdir=test_framework/terraform/${LONGHORN_TEST_CLOUDPROVIDER}/${DISTRO} apply -auto-approve -no-color
@@ -16,14 +21,20 @@ else
terraform -chdir=test_framework/terraform/${LONGHORN_TEST_CLOUDPROVIDER}/${DISTRO} apply -auto-approve -no-color
fi

if [[ "${TF_VAR_create_load_balancer}" == true ]]; then
terraform -chdir=test_framework/terraform/${LONGHORN_TEST_CLOUDPROVIDER}/${DISTRO} output -raw load_balancer_url > test_framework/load_balancer_url
fi

if [[ "${TF_VAR_k8s_distro_name}" == "k3s" ]]; then
terraform -chdir=test_framework/terraform/${LONGHORN_TEST_CLOUDPROVIDER}/${DISTRO} output -raw instance_mapping | jq 'map({(.name | split(".")[0]): .id}) | add' | jq -s add > /tmp/instance_mapping
if [[ ${LONGHORN_TEST_CLOUDPROVIDER} == "aws" ]]; then
if [[ "${TF_VAR_create_load_balancer}" == true ]]; then
terraform -chdir=test_framework/terraform/${LONGHORN_TEST_CLOUDPROVIDER}/${DISTRO} output -raw load_balancer_url > test_framework/load_balancer_url
fi
if [[ "${TF_VAR_k8s_distro_name}" == "k3s" ]]; then
terraform -chdir=test_framework/terraform/${LONGHORN_TEST_CLOUDPROVIDER}/${DISTRO} output -raw instance_mapping | jq 'map({(.name | split(".")[0]): .id}) | add' | jq -s add > /tmp/instance_mapping
fi
terraform -chdir=test_framework/terraform/${LONGHORN_TEST_CLOUDPROVIDER}/${DISTRO} output -raw controlplane_public_ip > /tmp/controlplane_public_ip
elif [[ ${LONGHORN_TEST_CLOUDPROVIDER} == "harvester" ]]; then
terraform -chdir=test_framework/terraform/${LONGHORN_TEST_CLOUDPROVIDER}/${DISTRO} output -raw kube_config > test_framework/kube_config.yaml
terraform -chdir=test_framework/terraform/${LONGHORN_TEST_CLOUDPROVIDER}/${DISTRO} output -raw cluster_id > /tmp/cluster_id
KUBECONFIG=${PWD}/test_framework/kube_config.yaml kubectl get nodes --no-headers --selector=node-role.kubernetes.io/control-plane -owide | awk '{print $6}' > /tmp/controlplane_public_ip
KUBECONFIG=${PWD}/test_framework/kube_config.yaml kubectl get nodes --no-headers -ojson | jq '.items[].metadata.name' | tr -d '"' > /tmp/instance_mapping
jq -Rn 'reduce inputs as $line ({}; .[$line] = $line)' /tmp/instance_mapping | sponge /tmp/instance_mapping
fi

terraform -chdir=test_framework/terraform/${LONGHORN_TEST_CLOUDPROVIDER}/${DISTRO} output -raw controlplane_public_ip > /tmp/controlplane_public_ip

exit $?
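
For context (values illustrative): both providers read the same /tmp/instance_mapping file via yaml.safe_load in host/base.py, but the contents differ. The AWS branch maps node names to EC2 instance ids from the terraform output, while the Harvester branch above builds an identity mapping (node name to node name) with jq, because the Kubernetes node name is also the Harvester VM name consumed by host/harvester.py.

import yaml

# AWS:       {"node-0": "i-0123456789abcdef0", ...}   (illustrative instance id)
# Harvester: {"node-0": "node-0", ...}
with open('/tmp/instance_mapping', 'r') as f:
    mapping = yaml.safe_load(f)  # YAML parses the JSON written by either branch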