Merge pull request #3143 from edx/michael/dependencies

Michael/dependencies

Merge pull request #3143 from edx/michael/dependencies
Michael/dependencies
2261475d · MichaelRoytman · GitHub · 3d9445f6 · c9ab8e12 · 2261475d
Commit 2261475d authored Jul 01, 2016 by MichaelRoytman Committed by GitHub Jul 01, 2016
Hide whitespace changes
Inline Side-by-side

Showing with 381 additions and 1 deletion

docker.mk
+1 -1

requirements.txt
+2 -0

util/parsefiles.py
+371 -0

util/parsefiles_config.yml
+7 -0

No files found.
--- a/docker.mk
+++ b/docker.mk
@@ -4,7 +4,7 @@ SHARD=0
 SHARDS=1

 dockerfiles:=$(shell ls docker/build/*/Dockerfile)
-images:=$(patsubst docker/build/%/Dockerfile,%,$(dockerfiles))
+images:=$(shell git diff --name-only $(TRAVIS_COMMIT_RANGE) | python util/parsefiles.py)

 docker_build=docker.build.
 docker_test=docker.test.

--- a/requirements.txt
+++ b/requirements.txt
@@ -13,6 +13,8 @@ prettytable==0.7.2
 awscli==1.10.28
 requests==2.9.1
 datadog==0.8.0
+networkx==1.11
+pathlib2==2.1.0

 # Needed for the mongo_* modules (playbooks/library/mongo_*)
 pymongo==3.1

--- a/util/parsefiles.py
+++ b/util/parsefiles.py
+import os
+import pathlib2
+import logging
+import yaml
+import sys
+import networkx as nx
+from collections import namedtuple
+import argparse
+
+# Root of the repository checkout, read from the TRAVIS_BUILD_DIR environment variable.
+# NOTE(review): os.environ.get() returns None when the variable is unset, and that value
+# is passed straight to pathlib2.Path below -- confirm this script only runs where it is set.
+TRAVIS_BUILD_DIR = os.environ.get("TRAVIS_BUILD_DIR")
+# Directory whose per-play sub-directories are checked for a Dockerfile (see filter_docker_plays).
+DOCKER_PATH_ROOT = pathlib2.Path(TRAVIS_BUILD_DIR, "docker", "build")
+# Location of the YAML configuration file read by the __main__ block.
+CONFIG_FILE_PATH = pathlib2.Path(TRAVIS_BUILD_DIR, "util", "parsefiles_config.yml")
+# Module-level logger used for every warning this script emits.
+LOGGER = logging.getLogger(__name__)
+
+def build_graph(git_dir, roles_dirs, aws_play_dirs, docker_play_dirs):
+    """
+    Builds a dependency graph that shows relationships between roles and playbooks.
+    An edge [A, B], where A and B are roles, signifies that A depends on B. An edge
+    [C, D], where C is a playbook and D is a role, signifies that C uses D.
+
+    Input:
+    git_dir: A path to the top-most directory in the local git repository tool is to be run in.
+    roles_dirs: A list of relative paths to directories in which Ansible roles reside.
+    aws_play_dirs: A list of relative paths to directories in which AWS Ansible playbooks reside.
+    docker_play_dirs: A list of relative paths to directories in which Docker Ansible playbooks reside.
+
+    """
+
+    graph = nx.DiGraph()
+
+    _map_roles_to_roles(graph, roles_dirs, git_dir, "dependencies", "role", "role")
+    _map_plays_to_roles(graph, aws_play_dirs, git_dir, "roles", "aws_playbook", "role")
+    _map_plays_to_roles(graph, docker_play_dirs, git_dir, "roles", "docker_playbook", "role")
+
+    return graph
+
+def _map_roles_to_roles(graph, dirs, git_dir, key, type_1, type_2):
+    """
+    Maps roles to the roles that they depend on.
+
+    Input:
+    graph: A networkx digraph that is used to map Ansible dependencies.
+    dirs: A list of relative paths to directories in which Ansible roles reside.
+    git_dir: A path to the top-most directory in the local git repository tool is to be run in.
+    key: The key in a role yaml file in dirs that maps to relevant role data. In this case, key is
+        "dependencies", because a role's dependent roles is of interest.
+    type_1: Given edges A-B, the type of node A.
+    type_2: Given edges A-B, the type of node B.
+        Since this function maps roles to their dependent roles, both type_1 and type_2 are "role".
+    """
+
+    Node = namedtuple('Node', ['name', 'type'])
+
+    # for each role directory
+    for d in dirs:
+        d = pathlib2.Path(git_dir, d)
+
+        # for all files/sub-directories in directory
+        for item in d.iterdir():
+
+            # attempts to find meta/*.yml file in item directory tree
+            roles = [f for f in item.glob("meta/*.yml")]
+
+            # if a meta/*.yml file(s) exists for a role
+            if roles:
+                # for each role
+                for role in roles:
+                    yaml_file = _open_yaml_file(role)
+
+                    # if not an empty yaml file and key in file
+                    if yaml_file is not None and key in yaml_file:
+                        # for each dependent role; yaml_file["dependencies"] returns list of
+                        # dependent roles
+                        for dependent in yaml_file[key]:
+                            # get role name of each dependent role
+                            name = _get_role_name(dependent)
+
+                            # add node for type_1, typically role
+                            node_1 = Node(item.name, type_1)
+
+                            # add node for type_2, typically dependent role
+                            node_2 = Node(name, type_2)
+
+                            # add edge, typically role - dependent role
+                            graph.add_edge(node_1, node_2)
+
+def _map_plays_to_roles(graph, dirs, git_dir, key, type_1, type_2):
+    """
+    Maps plays to the roles they use.
+
+    Input:
+    graph: A networkx digraph that is used to map Ansible dependencies.
+    dirs: A list of relative paths to directories in which Ansible playbooks reside.
+    git_dir: A path to the top-most directory in the local git repository tool is to be run in.
+    key: The key in a playbook yaml file in dirs that maps to relevant playbook data. In this case, key is
+        "roles", because the roles used by a playbook is of interest.
+    type_1: Given edges A-B, the type of node A.
+    type_2: Given edges A-B, the type of node B.
+        Since this function maps plays to the roles they use, both type_1 is a type of playbook and type_2 is "role".
+    """
+
+    Node = namedtuple('Node', ['name', 'type'])
+
+    # for each play directory
+    for d in dirs:
+        d = pathlib2.Path(git_dir, d)
+
+        # for all files/sub-directories in directory
+        for item in d.iterdir():
+
+            # if item is a file ending in .yml
+            if item.match("*.yml"):
+                # open .yml file for playbook
+                yaml_file = _open_yaml_file(item)
+
+                # if not an empty yaml file
+                if yaml_file is not None:
+                    # for each play in yaml file
+                    for play in yaml_file:
+                        # if specified key in yaml file (e.g. "roles")
+                        if key in play:
+                            # for each role
+                            for role in play[key]:
+                                # get role name
+                                name = _get_role_name(role)
+
+                                #add node for type_1, typically for playbook
+                                node_1 = Node(item.stem, type_1)
+
+                                # add node for type_2, typically for role
+                                node_2 = Node(name, type_2)
+
+                                 # add edge, typically playbook - role it uses
+                                graph.add_edge(node_1, node_2)
+
+def _open_yaml_file(file_str):
+    """
+    Opens yaml file.
+
+    Input:
+    file_str: The path to yaml file to be opened.
+    """
+
+    with (file_str.open(mode='r')) as file:
+        try:
+            yaml_file = yaml.load(file)
+            return yaml_file
+        except yaml.YAMLError, exc:
+            LOGGER.warning("error in configuration file: %s" % str(exc))
+            sys.exit(1)
+
+def change_set_to_roles(files, git_dir, roles_dirs, playbooks_dirs, graph):
+    """
+    Converts change set consisting of a number of files to the roles that they represent/contain.
+
+    Input:
+    files: A list of files modified by a commit range.
+    git_dir: A path to the top-most directory in the local git repository tool is to be run in.
+    roles_dirs: A list of relative paths to directories in which Ansible roles reside.
+    playbook_dirs: A list of relative paths to directories in which Ansible playbooks reside.
+    graph: A networkx digraph that is used to map Ansible dependencies.
+    """
+
+    # set of roles
+    items = set()
+
+    # for all directories containing roles
+    for role_dir in roles_dirs:
+        role_dir_path = pathlib2.Path(git_dir, role_dir)
+
+        # get all files in the directories containing roles (i.e. all the roles in that directory)
+        candidate_files = (f for f in role_dir_path.glob("**/*"))
+
+        # for all the files in the change set
+        for f in files:
+            file_path = pathlib2.Path(git_dir, f)
+
+            # if the change set file is in the set of role files
+            if file_path in candidate_files:
+                # get name of role and add it to set of roles of the change set
+                items.add(_get_resource_name(file_path, "roles"))
+
+    # for all directories containing playbooks
+    for play_dir in playbooks_dirs:
+        play_dir_path = pathlib2.Path(git_dir, play_dir)
+
+        # get all files in directory containing playbook that end with yml extension
+        # (i.e. all playbooks in that directory)
+        candidate_files = (f for f in play_dir_path.glob("*.yml"))
+
+        # for all filse in the change set
+        for f in files:
+            file_path = pathlib2.Path(git_dir, f)
+
+            # if the change set file is in teh set of playbook files
+            if file_path in candidate_files:
+
+                # gets first level of children of playbook in graph, which represents
+                # all roles the playbook uses
+                descendants = nx.all_neighbors(graph, (file_path.stem, "aws_playbook"))
+
+                # adds all the roles that a playbook uses to set of roles of the change set
+                items |= {desc.name for desc in descendants}
+    return items
+
+def _get_resource_name(path, kind):
+    """
+    Gets name of resource from the filepath, which is the directory following occurence of kind.
+
+    Input:
+    path: A path to the resource (e.g. a role or a playbook)
+    kind: A description of the type of resource; this keyword precedes the name of a role or a playbook
+        in a file path and allows for the separation of its name;
+        e.g. for "configuration/playbooks/roles/discovery/...", kind = "roles" returns
+        "discovery" as the role name
+    """
+    # get individual parts of a file path
+    dirs = path.parts
+
+    # type of resource is the next part of the file path after kind (e.g. after "roles" or "playbooks")
+    return dirs[dirs.index(kind)+1]
+
+def get_dependencies(roles, graph):
+    """
+    Determines all roles dependent on set of roles and returns set containing both.
+
+    Input:
+    roles: A set of roles.
+    graph: A networkx digraph that is used to map Ansible dependencies.
+    """
+
+    items = set()
+
+    for role in roles:
+        # add the role itself
+        items.add(role)
+
+        # add all the roles that depend on the role
+        dependents = nx.descendants(graph, (role, "role"))
+
+        items |= {dependent.name for dependent in dependents}
+
+    return items
+
+def get_docker_plays(roles, graph):
+    """Gets all docker plays that contain at least role in common with roles."""
+
+    # dict to determine coverage of plays
+    coverage = dict.fromkeys(roles, False)
+
+    items = set()
+
+    docker_plays = (node.name for node in graph.nodes() if node.type == "docker_playbook")
+
+    for play in docker_plays:
+        # all roles that are used by play
+        roles_nodes = nx.all_neighbors(graph, (play, "docker_playbook"))
+
+        docker_roles = {role.name for role in roles_nodes}
+
+        # compares roles and docker roles
+        common_roles = roles & docker_roles
+
+        # if their intersection is non-empty, add the docker role
+        if common_roles:
+            items.add(play)
+
+            # each aws role that was in common is marked as being covered by a docker play
+            for role in common_roles:
+                coverage[role] = True
+
+    # check coverage of roles
+    for role in coverage:
+        if not coverage[role]:
+            LOGGER.warning("role '%s' is not covered." % role)
+
+    return items
+
+def filter_docker_plays(plays, repo_path):
+    """Filters out docker plays that do not have a Dockerfile."""
+
+    items = set()
+
+    for play in plays:
+        dockerfile = pathlib2.Path(DOCKER_PATH_ROOT, play, "Dockerfile")
+
+        if dockerfile.exists():
+            items.add(play)
+        else:
+            LOGGER.warning("covered playbook '%s' does not have Dockerfile." % play)
+
+    return items
+
+def _get_role_name(role):
+    """
+    Resolves a role name from either a simple declaration or a dictionary style declaration.
+
+    A simple declaration would look like:
+    - foo
+
+    A dictionary style declaration would look like:
+    - role: rbenv
+      rbenv_user: "{{ forum_user }}"
+      rbenv_dir: "{{ forum_app_dir }}"
+      rbenv_ruby_version: "{{ forum_ruby_version }}"
+
+    :param role:
+    :return:
+    """
+    if isinstance(role, dict):
+        return role['role']
+    elif isinstance(role, basestring):
+        return role
+    else:
+        LOGGER.warning("role %s could not be resolved to a role name." % role)
+        return None
+
+def arg_parse():
+
+    parser = argparse.ArgumentParser(description = 'Given a commit range, analyze Ansible dependencies between roles and playbooks '
+    'and output a list of Docker plays affected by this commit range via these dependencies.')
+    parser.add_argument('--verbose', help="set warnings to be displayed", action="store_true")
+
+    return parser.parse_args()
+
+if __name__ == '__main__':
+
+    args = arg_parse()
+
+    # configure logging
+    logging.basicConfig()
+
+    if not args.verbose:
+        logging.disable(logging.WARNING)
+
+    # set of modified files in the commit range
+    change_set = set()
+
+    # read from standard in
+    for line in sys.stdin:
+        change_set.add(line.rstrip())
+
+    # configuration file is expected to be in the following format:
+    #
+    # roles_paths:
+    #       - <all paths relative to configuration repository that contain Ansible roles>
+    # aws_plays_paths:
+    #       - <all paths relative to configuration repository that contain aws Ansible playbooks>
+    # docker_plays_paths:
+    #       - <all paths relative to configuration repositroy that contain Docker Ansible playbooks>
+
+    # read config file
+    config = _open_yaml_file(CONFIG_FILE_PATH)
+
+    # build graph
+    graph = build_graph(TRAVIS_BUILD_DIR, config["roles_paths"], config["aws_plays_paths"], config["docker_plays_paths"])
+
+    # transforms list of roles and plays into list of original roles and the roles contained in the plays
+    roles = change_set_to_roles(change_set, TRAVIS_BUILD_DIR, config["roles_paths"], config["aws_plays_paths"], graph)
+
+    # expands roles set to include roles that are dependent on existing roles
+    dependent_roles = get_dependencies(roles, graph)
+
+    # determine which docker plays cover at least one role
+    docker_plays = get_docker_plays(dependent_roles, graph)
+
+    # filter out docker plays without a Dockerfile
+    docker_plays = filter_docker_plays(docker_plays, TRAVIS_BUILD_DIR)
+
+    # prints Docker plays
+    print " ".join(str(play) for play in docker_plays)
--- a/util/parsefiles_config.yml
+++ b/util/parsefiles_config.yml
+roles_paths:
+  - playbooks/roles
+aws_plays_paths:
+  - playbooks
+  - playbooks/edx-east
+docker_plays_paths:
+  - docker/plays