*** Settings ***
Documentation     Suite for controlled installation of ${FEATURE_ONCT}
...
...               Copyright (c) 2016 Cisco Systems, Inc. and others. All rights reserved.
...
...               This program and the accompanying materials are made available under the
...               terms of the Eclipse Public License v1.0 which accompanies this distribution,
...               and is available at http://www.eclipse.org/legal/epl-v10.html
...
...
...               This suite requires the odl-netconf-ssh feature to be already installed,
...               otherwise the SSH bundle refresh will cause the connection to drop and the karaf command to "fail".
...
...               Operation of clustered netconf topology relies on two key services:
...               the netconf topology manager application, which runs on the member
...               which owns the "topology-manager" entity (of "netconf-topology" type),
...               and the config datastore shard for the network-topology module,
...               which is controlled by the Leader of the config topology shard.
...               The Leader provides the desired state (concerning Netconf connectors);
...               the Owner consumes that state, performs the necessary actions and updates the operational view.
...               In this suite, the common name for the Owner and the Leader is Manager.
...
...               In a typical cluster High Availability testing scenario,
...               one cluster member is selected, killed (or isolated), and later re-started (re-joined).
...               For Netconf cluster topology testing, there will be scenarios targeting
...               the Owner, and other scenarios targeting the Leader.
...
...               But both Owner and Leader selection is governed by the same RAFT algorithm,
...               which relies on message ordering, so there are two typical cases:
...               either one member becomes both Owner and Leader,
...               or the two Managers are located at random.
...
...               As the targeted scenarios require the two Managers to reside on different members,
...               neither of the two cases is beneficial for testing.
...
...               There are APIs in place which should allow relocation of the Leader,
...               but there are no system tests for them yet.
...               TODO: Study those APIs and create the missing system tests.
...
...               This suite helps with the Manager placement situation
...               by performing feature installation at runtime, applying the following strategy:
...
...               An N-node cluster is started (without ${FEATURE_ONCT} installed),
...               and it is verified that one node has become the Leader of the topology config shard.
...               As ${FEATURE_ONCT} is installed on the (N-1) follower members
...               (but not on the Leader yet), it is expected that one of those members
...               becomes Owner of the topology-manager entity.
...               After verifying that, ${FEATURE_ONCT} is installed on the Leader.
...               If neither the Owner nor the Leader has moved, the desired placement has been created.
...
...               More specifically, this suite assumes the cluster has been started,
...               it has stabilized, and ${FEATURE_ONCT} is not installed anywhere.
...               After a successful run of this suite, the feature is installed on each member,
...               and the Owner is verified to be placed on a different member than the Leader.
...
...               Note that stress tests may cause Akka delays, which may move the Managers around.
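# A minimal commented sketch (not executed by this suite), assuming the standard ODL CSIT variables
# ${ODL_SYSTEM_1_IP}, ${RESTCONFPORT} and ${AUTH} are available: the entity ownership data that the
# ClusterManagement keywords below query can also be inspected directly over RESTCONF, e.g.:
#    RequestsLibrary.Create_Session    inspect    http://${ODL_SYSTEM_1_IP}:${RESTCONFPORT}    auth=${AUTH}
#    ${resp} =    RequestsLibrary.Get_Request    inspect    restconf/operational/entity-owners:entity-owners
#    BuiltIn.Log    ${resp.text}    # shows owner and candidates for the "topology-netconf" entity type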
Suite Setup       Setup_Everything
Suite Teardown    Teardown_Everything
Test Setup        SetupUtils.Setup_Test_With_Logging_And_Without_Fast_Failing
Test Teardown     SetupUtils.Teardown_Test_Show_Bugs_If_Test_Failed
Default Tags      clustering    netconf    critical
Resource          ${CURDIR}/../../../libraries/CarPeople.robot
Resource          ${CURDIR}/../../../libraries/ClusterManagement.robot
Resource          ${CURDIR}/../../../libraries/SetupUtils.robot
Resource          ${CURDIR}/../../../libraries/WaitForFailure.robot

*** Variables ***
${FEATURE_ONCT}    odl-netconf-clustered-topology    # the feature name is mentioned multiple times, this is to prevent typos
${OWNER_ELECTION_TIMEOUT}    180s    # very large value to allow for -all- jobs with many feature installations taking up time

*** Test Cases ***
Locate_Leader
    [Documentation]    Set suite variables based on where the Leader is.
    ...    As this test may get executed just after cluster restart, WUKS is used to give ODL a chance to elect Leaders.
    BuiltIn.Comment    FIXME: Migrate Set_Variables_For_Shard to ClusterManagement.robot
    BuiltIn.Wait_Until_Keyword_Succeeds    3m    15s    CarPeople.Set_Variables_For_Shard    shard_name=topology    shard_type=config

Install_Feature_On_Followers
    [Documentation]    Perform feature installation on follower members, one by one.
    ...    As the first connection attempt may fail (coincidence with ssh bundle refresh), WUKS is used.
    # Make sure this works; the alternative is to perform the installation in parallel.
    BuiltIn.Wait_Until_Keyword_Succeeds    3x    1s    ClusterManagement.Install_Feature_On_List_Or_All    feature_name=${FEATURE_ONCT}    member_index_list=${topology_follower_indices}    timeout=60s

Locate_Owner
    [Documentation]    Wait for the Owner to appear, store its index into a suite variable.
    BuiltIn.Wait_Until_Keyword_Succeeds    ${OWNER_ELECTION_TIMEOUT}    3s    Single_Locate_Owner_Attempt    member_index_list=${topology_follower_indices}

Install_Feature_On_Leader
    [Documentation]    Perform feature installation on the Leader member.
    ...    This seems to be failing, so use TRACE log.
    ClusterManagement.Install_Feature_On_Member    feature_name=${FEATURE_ONCT}    member_index=${topology_leader_index}    timeout=60s

Verify_Managers_Are_Stationary
    [Documentation]    Keep checking that Managers do not move for a while.
    WaitForFailure.Verify_Keyword_Does_Not_Fail_Within_Timeout    ${OWNER_ELECTION_TIMEOUT}    1s    Check_Manager_Positions

*** Keywords ***
Setup_Everything
    [Documentation]    Initialize libraries and set suite variables.
    SetupUtils.Setup_Utils_For_Setup_And_Teardown
    ClusterManagement.ClusterManagement_Setup

Teardown_Everything
    [Documentation]    Teardown the test infrastructure, perform cleanup and release all resources.
    RequestsLibrary.Delete_All_Sessions

Single_Locate_Owner_Attempt
    [Arguments]    ${member_index_list}=${EMPTY}
    [Documentation]    Performs actions on given (or all) members, one by one:
    ...    For the first member listed: get the actual owner, check candidates, store the owner into a suite variable.
    ...    (If the list has fewer than one item, this Keyword will fail.)
    ...    For other nodes: get the actual owner, check candidates, compare to the results from the first listed member.
    BuiltIn.Comment    FIXME: Work with a sorted candidate list instead of the candidate list length.
    ${index_list} =    ClusterManagement.ClusterManagement__Given_Or_Internal_Index_List    ${member_index_list}
    ${require_candidate_list} =    BuiltIn.Create_List    @{index_list}
    ${first_index_listed} =    Collections.Remove_From_List    ${index_list}    ${0}
    # Now ${index_list} contains only the rest of indices.
    ${netconf_manager_owner_index}    ${candidates} =    ClusterManagement.Get_Owner_And_Candidates_For_Type_And_Id    type=topology-netconf    id=/general-entity:entity[general-entity:name='topology-manager']    member_index=${first_index_listed}    require_candidate_list=${require_candidate_list}
    BuiltIn.Set_Suite_Variable    \${netconf_manager_owner_index}
    : FOR    ${index}    IN    @{index_list}
    \    ${new_owner}    ${new_candidates} =    ClusterManagement.Get_Owner_And_Candidates_For_Type_And_Id    type=topology-netconf    id=/general-entity:entity[general-entity:name='topology-manager']    member_index=${index}
    \    ...    require_candidate_list=${require_candidate_list}
    \    BuiltIn.Should_Be_Equal    ${new_owner}    ${netconf_manager_owner_index}    Member-${index} owner ${new_owner} is not ${netconf_manager_owner_index}

Check_Manager_Positions
    [Documentation]    For each Manager, locate its current position and check it is the one stored in suite variable.
    ${new_leader}    ${followers} =    ClusterManagement.Get_Leader_And_Followers_For_Shard    shard_name=topology    shard_type=config
    BuiltIn.Should_Be_Equal    ${topology_leader_index}    ${new_leader}
    ${new_owner}    ${candidates} =    ClusterManagement.Get_Owner_And_Candidates_For_Type_And_Id    type=topology-netconf    id=/general-entity:entity[general-entity:name='topology-manager']    member_index=${topology_first_follower_index}
    BuiltIn.Should_Be_Equal    ${netconf_manager_owner_index}    ${new_owner}
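
# The keyword below is an illustrative sketch, not called by the test cases above; it merely wraps
# the owner query repeated in Single_Locate_Owner_Attempt and Check_Manager_Positions, showing the
# minimal call needed to locate the Owner of the topology-manager entity from a given member.
Get_Topology_Manager_Owner
    [Arguments]    ${member_index}
    [Documentation]    Return the index of the member currently owning the topology-manager entity, as seen by the given member.
    ${owner}    ${candidates} =    ClusterManagement.Get_Owner_And_Candidates_For_Type_And_Id    type=topology-netconf    id=/general-entity:entity[general-entity:name='topology-manager']    member_index=${member_index}
    [Return]    ${owner}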