From 9e5484607f5d66df7e872554fb617014c86b3147 Mon Sep 17 00:00:00 2001 From: Yassine Ilmi Date: Wed, 26 Jun 2024 21:06:54 +0100 Subject: [PATCH 1/3] GitHub Inventory module --- .../inventory/github_inventory/.env.example | 6 + scripts/inventory/github_inventory/README.md | 87 +++++ .../github_inventory/__init__.py | 0 .../github_inventory/__main__.py | 3 + .../github_inventory/config.py | 12 + .../organization_repositories.graphql | 70 ++++ .../repository_details.graphql | 135 ++++++++ .../github_inventory/github_inventory/main.py | 308 ++++++++++++++++++ .../inventory/github_inventory/pyproject.toml | 18 + .../inventory/github_inventory/settings.toml | 10 + .../github_inventory/tests/__init__.py | 0 11 files changed, 649 insertions(+) create mode 100644 scripts/inventory/github_inventory/.env.example create mode 100644 scripts/inventory/github_inventory/README.md create mode 100644 scripts/inventory/github_inventory/github_inventory/__init__.py create mode 100644 scripts/inventory/github_inventory/github_inventory/__main__.py create mode 100644 scripts/inventory/github_inventory/github_inventory/config.py create mode 100644 scripts/inventory/github_inventory/github_inventory/graphql_queries/organization_repositories.graphql create mode 100644 scripts/inventory/github_inventory/github_inventory/graphql_queries/repository_details.graphql create mode 100644 scripts/inventory/github_inventory/github_inventory/main.py create mode 100644 scripts/inventory/github_inventory/pyproject.toml create mode 100644 scripts/inventory/github_inventory/settings.toml create mode 100644 scripts/inventory/github_inventory/tests/__init__.py diff --git a/scripts/inventory/github_inventory/.env.example b/scripts/inventory/github_inventory/.env.example new file mode 100644 index 0000000..cfe4376 --- /dev/null +++ b/scripts/inventory/github_inventory/.env.example @@ -0,0 +1,6 @@ +GITHUB_INVENTORY_TOKEN="" +GITHUB_INVENTORY_ORG="" +# GITHUB_INVENTORY_PR=false +# GITHUB_INVENTORY_ISSUES=false 
+# GITHUB_INVENTORY_GQL_QUERY_ORG=
+# GITHUB_INVENTORY_GQL_QUERY_REPO=
diff --git a/scripts/inventory/github_inventory/README.md b/scripts/inventory/github_inventory/README.md
new file mode 100644
index 0000000..8a5755e
--- /dev/null
+++ b/scripts/inventory/github_inventory/README.md
@@ -0,0 +1,87 @@
+# GitHub Inventory
+
+This directory contains scripts for interacting with the GitHub GraphQL API to fetch information about a GitHub organization's repositories, pull requests, and issues, including labels for each issue and pull request. It also allows filtering the fetched issues and pull requests based on their labels.
+
+The script supports configuration, in order of priority, through command line arguments, environment variables prefixed with GITHUB_INVENTORY_, or through a TOML configuration file.
+
+## Files
+
+- `github_inventory/main.py`: This is the main script. It uses the GitHub GraphQL API to fetch all repositories and optionally their pull requests, issues, topics and languages for a specified GitHub organization, handling pagination to fetch all results even if there are more than the API's maximum limit per request.
+
+- `.env.example`: This is an example of the .env file that could be used to set up environment variables. This will override manually set environment variables; rename it to .env with your values.
+
+- `settings.toml`: This is a configuration file that can be used to specify the organization to fetch data from, the labels to filter on, and the fields to fetch for each repository, pull request, and issue. The script will use first the command line arguments, then the environment variables, then the values in this file.
+
+- `pyproject.toml`: This is the configuration file for the `poetry` package manager, which is used to manage dependencies for the script.
+
+
+- `github_inventory/graphql_queries/*`: Contains the GraphQL queries used to fetch the data. These can be copied and modified before being passed as a custom GQL query.
+
+## Requirements
+
+- Python 3.9 or higher
+- [`poetry` package manager](https://python-poetry.org/docs/#installation)
+
+## Dependencies
+
+- dynaconf = "^3.2.5"
+- requests = "^2.32.3"
+- python-dotenv = "^1.0.1"
+
+## Usage
+
+### Examples
+
+1. To run, set up the two environment variables
+```bash
+read GITHUB_INVENTORY_TOKEN
+
+export GITHUB_INVENTORY_TOKEN=$GITHUB_INVENTORY_TOKEN
+export GITHUB_INVENTORY_ORG= # You can also pass this directly through the --org command line argument
+```
+
+2. Setup the poetry environment
+```bash
+poetry install
+```
+
+3. Run the script with Python:
+
+```bash
+poetry run python -m github_inventory
+```
+
+OR
+
+```bash
+poetry run python -m github_inventory --org <org> --issues --pr --issues-labels my-issue-label1 --pr-labels my-pr-label
+
+```
+
+4. The script will write the fetched data to a JSON file `inventory-<org>.json`.
+
+### Supported parameters
+
+> All parameters are supported as environment variables, the module expects them to be set with the `GITHUB_INVENTORY_` prefix
+
+The following parameters are supported:
+| Parameter | CLI | settings.toml | Environment variable | Default |
+|----------------------|------------------------|------------------------|------------------------------------|---------|
+| GitHub Organization | `--org <org>` | org = "" | `GITHUB_INVENTORY_ORG` | "" |
+| Dot Env file | `--env <path>` | dotenv_path = "" | `GITHUB_INVENTORY_DOTENV_PATH` | .env |
+| Pull Issues | `--issues` | issues = false | `GITHUB_INVENTORY_ISSUES` | false |
+| Pull PRs | `--pr` | pr = false | `GITHUB_INVENTORY_PR` | false |
+| Custom Org GQL Query | `--gql-query-org <path>` | gql_query_org = "" | `GITHUB_INVENTORY_GQL_QUERY_ORG` | "" |
+| Custom Repo GQL Query| `--gql-query-repo <path>` | gql_query_repo = "" | `GITHUB_INVENTORY_GQL_QUERY_REPO` | "" |
+
+
+
+# Notes about the current GraphQL queries
+
+We had to compromise on the data we fetch from GitHub to avoid hitting the API rate limit with costly queries.
For example, fetching the first 100 repositories of an organization is a cheap query, and fetching the first 100 repositories with their issues and pull requests doesn't cost that much either. But the GitHub GraphQL API may take too long to return the data in cases where all the repos have 100 issues and 100 pull requests, each with labels.
+
+This is why we decided to fetch only the first 100 repositories of an organization, and then fetch their issues and pull requests through a separate query; doing so also keeps the cost of fetching labels for each issue low. Fetching objects that are three levels deep causes expensive queries, and is sometimes not allowed as it could result in millions of objects being pulled. For example, fetching a 100-repository page at the org level and, for each repository, 100-item PR and issue pages, each with the first 10 assignees and labels, costs 404 requests on the rate limit.
+
+For similar reasons, we don't fetch the bodyText of issues and pull requests, as this has resulted in many timeouts or errors on large test organizations. While this means that fetching the repositories list, their issues, pull requests, topics, and languages takes a while, it ensures that we stay within the API rate limits and avoid unnecessary delays and errors.
+
+The script is designed to be flexible and configurable to meet the needs of different organizations and use cases. By adjusting the command line arguments, environment variables, or configuration file settings, users can tailor the script to fetch exactly the data they need.
diff --git a/scripts/inventory/github_inventory/github_inventory/__init__.py b/scripts/inventory/github_inventory/github_inventory/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/scripts/inventory/github_inventory/github_inventory/__main__.py b/scripts/inventory/github_inventory/github_inventory/__main__.py new file mode 100644 index 0000000..88ca76a --- /dev/null +++ b/scripts/inventory/github_inventory/github_inventory/__main__.py @@ -0,0 +1,3 @@ +from github_inventory.main import main + +main(argv=None) diff --git a/scripts/inventory/github_inventory/github_inventory/config.py b/scripts/inventory/github_inventory/github_inventory/config.py new file mode 100644 index 0000000..a026745 --- /dev/null +++ b/scripts/inventory/github_inventory/github_inventory/config.py @@ -0,0 +1,12 @@ +from dynaconf import Dynaconf, Validator + +settings = Dynaconf( + envvar_prefix="GITHUB_INVENTORY", + load_dotenv=True, + settings_files=["settings.toml", ".secrets.toml"], + validate_on_update=True, +) +settings.validators.register( + Validator("TOKEN", required=True, is_type_of=str, len_min=1), + Validator("ORG", required=True, is_type_of=str, len_min=1), +) diff --git a/scripts/inventory/github_inventory/github_inventory/graphql_queries/organization_repositories.graphql b/scripts/inventory/github_inventory/github_inventory/graphql_queries/organization_repositories.graphql new file mode 100644 index 0000000..d20f443 --- /dev/null +++ b/scripts/inventory/github_inventory/github_inventory/graphql_queries/organization_repositories.graphql @@ -0,0 +1,70 @@ +query OrganizationRepositories($org: String!, +$organizationCursor: String +) { + organization(login: $org) { + name + repositories(first: 100, after: $organizationCursor) { + totalCount + nodes { + id + name + description + createdAt + pushedAt + updatedAt + diskUsage + isArchived + isDisabled + isEmpty + isPrivate + visibility + nameWithOwner + owner { + login + } + defaultBranchRef { + name + } + 
primaryLanguage { + name + } + # This will slow down the query resulting in timeouts and errors for large GitHub organizations + # Feel free to enable it if you want to fetch this information for your organization + # pullRequests(first: 0) { + # totalCount + # } + # issues(first: 0) { + # totalCount + # } + # We also limit the number of topics and languages to 5 to avoid timeouts, but use a custom query to change it + repositoryTopics(first: 5) { + totalCount + nodes { + topic { + name + } + } + } + languages (first: 5) { + totalCount + totalSize + edges { + node { + name + } + size + } + } + } + pageInfo { + endCursor + hasNextPage + } + } + } + rateLimit { + cost + remaining + resetAt + } +} diff --git a/scripts/inventory/github_inventory/github_inventory/graphql_queries/repository_details.graphql b/scripts/inventory/github_inventory/github_inventory/graphql_queries/repository_details.graphql new file mode 100644 index 0000000..b29a5b8 --- /dev/null +++ b/scripts/inventory/github_inventory/github_inventory/graphql_queries/repository_details.graphql @@ -0,0 +1,135 @@ +query RepositoryDetails( + $org: String! + $repositoryName: String! + $pullRequestCursor: String + $issueCursor: String + $pullRequestStep: Int + $issueStep: Int + $pullRequestsLabel: [String!] + $issuesLabel: [String!] 
+) { + repository(owner: $org, name: $repositoryName) { + id + name + description + createdAt + pushedAt + updatedAt + diskUsage + isArchived + isDisabled + isEmpty + isPrivate + visibility + nameWithOwner + owner { + login + } + defaultBranchRef { + name + } + primaryLanguage { + name + } + pullRequests( + first: $pullRequestStep + after: $pullRequestCursor + labels: $pullRequestsLabel + ) { + totalCount + nodes { + id + number + title + bodyText + isDraft + merged + mergedAt + publishedAt + url + state + author { + login + } + baseRefName + baseRepository { + name + } + headRefName + createdAt + updatedAt + # We assume that + labels(first: 10) { + nodes { + name + } + } + assignees(first: 10) { + nodes { + login + } + } + } + pageInfo { + endCursor + hasNextPage + } + } + issues(first: $issueStep, after: $issueCursor, labels: $issuesLabel) { + totalCount + nodes { + id + number + title + bodyText + publishedAt + url + state + author { + login + } + createdAt + updatedAt + closedAt + labels(first: 10) { + nodes { + name + } + } + assignees(first: 10) { + nodes { + login + } + } + } + pageInfo { + endCursor + hasNextPage + } + } + # We decided to limit the number of topics and languages to 10, use a custom query to change it + repositoryTopics(first: 10) { + totalCount + nodes { + topic { + name + } + } + } + languages(first: 10) { + totalCount + totalSize + edges { + node { + name + } + size + } + } + } + rateLimit { + cost + remaining + resetAt + } +} diff --git a/scripts/inventory/github_inventory/github_inventory/main.py b/scripts/inventory/github_inventory/github_inventory/main.py new file mode 100644 index 0000000..d2c89c4 --- /dev/null +++ b/scripts/inventory/github_inventory/github_inventory/main.py @@ -0,0 +1,308 @@ +from __future__ import annotations +import argparse +import requests +import json +import time +import sys +import logging +from pathlib import Path +from github_inventory.config import settings +from dynaconf.validator import ValidationError 
+ +# Set up logging +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s", + handlers=[logging.FileHandler("github_inventory.log"), logging.StreamHandler()], +) + + +# https://docs.github.com/en/rest/rate-limit/rate-limit?apiVersion=2022-11-28 +# https://docs.github.com/en/rest/guides/best-practices-for-integrators?apiVersion=2022-11-28#dealing-with-secondary-rate-limits +def handle_rate_limit(session, client_response): + retry_after = client_response.headers.get("Retry-After") + rate_limit_remaining = client_response.headers.get("x-ratelimit-remaining") + rate_limit_type = client_response.headers.get("x-ratelimit-resource") + + if rate_limit_remaining and rate_limit_remaining == 0: + logging.error( + "First rate limit hit, waiting for limit reset before continuing." + ) + response = session.get("https://api.github.com/rate_limit") + if not (200 <= response.status_code <= 299): + logging.error("Error occurred while checking rate limit.") + return None + rate_limit_details = response.json()["resources"][ + rate_limit_type if rate_limit_type else "graphql" + ] + reset_in_minutes = (rate_limit_details["reset"] - int(time.time())) / 60 + logging.info( + f"Current rate limit of {rate_limit_details['limit']} hit, waiting for limit to expires in {round(reset_in_minutes,0)} minutes." + ) + while int(time.time()) <= int(client_response.headers.get("x-ratelimit-reset")): + time.sleep(60) + return True + if retry_after: + logging.error( + f'Second rate limit hit, waiting {retry_after} seconds before retrying (based on "Retry-After" header).' 
+ ) + time.sleep(int(retry_after)) + return True + return False + + +def run_query(session, query, variables=None, retry_count=5): + response = session.post( + "https://api.github.com/graphql", json={"query": query, "variables": variables} + ) + if not (200 <= response.status_code <= 299): + if handle_rate_limit(session, response) and retry_count > 0: + logging.info( + f"Rate limit hit, retrying request. Retries left: {retry_count - 1}" + ) + return run_query(session, query, variables, retry_count - 1) + elif response.status_code in (502, 504) and retry_count > 0: + logging.error( + f"GraphQL returned a 502 or 504, this may be caused by a timeout, retrying {retry_count + 1} more times before failing" + ) + return run_query(session, query, variables, retry_count - 1) + elif response.status_code in (401, 403): + logging.error( + "Unauthorized access, please check your GitHub access token and organization name." + ) + sys.exit(1) + else: + logging.error( + f"Received HTTP Response Status code {response.status_code} while performing request. Response: {response.text}." 
+ ) + return None + json_response = response.json() + if "errors" in json_response: + logging.error( + f"Error occurred while performing request: {json_response['errors']}" + ) + logging.error(f"Query: {query}") + logging.error(f"Variables: {variables}") + return None + return json_response + + +def get_repository_details(session, repository, query): + filter_pr, filter_issues = settings.pr_labels, settings.issues_labels + # Let's first to try to fetch the repository with 100 pull requests and 100 issues, if there are more we'll handle pagination + repository_variables = { + "org": settings.org, + "repositoryName": repository["name"], + "pullRequestCursor": "", + "issueCursor": "", + "pullRequestStep": 100 if settings.pr else 0, + "issueStep": 100 if settings.issues else 0, + "pullRequestsLabel": filter_pr, + "issuesLabel": filter_issues, + } + repository = run_query(session, query, repository_variables)["data"]["repository"] + + pr_page_info = ( + repository["pullRequests"]["pageInfo"] + if settings.pr + else {"hasNextPage": False} + ) + while pr_page_info["hasNextPage"]: + # We don't want to fetch issues here, so setting issueStep to 0 to reduce the cost of the query by one + pr_variables = repository_variables.copy() + pr_variables.update( + { + "pullRequestCursor": pr_page_info["endCursor"], + "pullRequestStep": 100, + "issueStep": 0, + "pullRequestsLabel": filter_pr, + "issuesLabel": "", + } + ) + + pr_result = run_query(session, query, pr_variables) + repository["pullRequests"]["nodes"].extend( + pr_result["data"]["repository"]["pullRequests"]["nodes"] + ) + pr_page_info = pr_result["data"]["repository"]["pullRequests"]["pageInfo"] + + # Handle pagination for issues + issue_page_info = ( + repository["issues"]["pageInfo"] if settings.issues else {"hasNextPage": False} + ) + while issue_page_info["hasNextPage"]: + # We don't want to fetch pull requests, so in this case we're setting the pullRequestStep to 0 + issues_variables = repository_variables.copy() + 
issues_variables.update( + { + "issueCursor": issue_page_info["endCursor"], + "pullRequestStep": 0, + "issueStep": 100, + "pullRequestsLabel": "", + "issuesLabel": filter_issues, + } + ) + issue_result = run_query(session, query, issues_variables) + repository["issues"]["nodes"].extend( + issue_result["data"]["repository"]["issues"]["nodes"] + ) + issue_page_info = issue_result["data"]["repository"]["issues"]["pageInfo"] + yield repository + + +def get_repositories(session, queries): + org_cursor = "" + totalCount = None + organization_variables = {"org": settings.org, "organizationCursor": org_cursor} + while True: + organization_variables.update({"organizationCursor": org_cursor}) + result = run_query(session, queries["org"], organization_variables) + if not totalCount: + totalCount = { + "initialCount": result["data"]["organization"]["repositories"][ + "totalCount" + ], + "currentCount": 0, + } + + totalCount["currentCount"] += len( + result["data"]["organization"]["repositories"]["nodes"] + ) + + logging.info( + f"Progress: {totalCount['currentCount']} / {totalCount['initialCount']} - Total repositories left to fetch: {totalCount['initialCount'] - totalCount['currentCount']}." 
+ ) + for repository in result["data"]["organization"]["repositories"]["nodes"]: + if settings.pr or settings.issues: + yield from get_repository_details( + session, + repository, + queries["repository"], + ) + else: + yield repository + + page_info = result["data"]["organization"]["repositories"]["pageInfo"] + if page_info["hasNextPage"]: + org_cursor = page_info["endCursor"] + else: + break + + +def clean_up(object): + result = {} + for k, v in object.items(): + if isinstance(v, (str, int, bool, list)) or v is None: + result[k] = v + elif isinstance(v, dict): + if "login" in v: + result[k] = v["login"] + elif "name" in v: + result[k] = v["name"] + elif "edges" in v and v["edges"]: + result[k] = { + "totalCount": v["totalCount"], + "totalSize": v["totalSize"], + "edges": [ + {"name": edge["node"]["name"], "size": edge["size"]} + for edge in v["edges"] + ], + } + elif "nodes" in v: + + result[k] = [clean_up(node) for node in v["nodes"]] + else: + pass + return result + + +def main(argv=None): + current_script_directory = Path(__file__).parent.__str__() + "/" + parser = argparse.ArgumentParser(description="GitHub inventory script") + + try: + parser.add_argument( + "--env", + default=settings.get("dotenv_path", ".env"), + help="Path to .env file", + ) + parser.add_argument( + "--org", default=settings.get("org"), help="GitHub organization name" + ) + parser.add_argument( + "--pr", + default=settings.get("pr"), + action="store_true", + help="Fetch pull requests", + ) + parser.add_argument( + "--issues", + default=settings.get("issues"), + action="store_true", + help="Fetch issues", + ) + parser.add_argument("--pr-labels", help="Filter pull requests by label") + parser.add_argument("--issues-labels", help="Filter issues by label") + parser.add_argument( + "--gql-query-org", + default=settings.get( + "gql_query_org", + current_script_directory + + "graphql_queries/organization_repositories.graphql", + ), + help="Path to custom GraphQL query for fetching organization 
repositories", + ) + parser.add_argument( + "--gql-query-repo", + default=settings.get( + "gql_query_repo", + current_script_directory + "graphql_queries/repository_details.graphql", + ), + help="Path to custom GraphQL query for fetching repository details", + ) + except argparse.ArgumentError as e: + logging.error(f"Error occurred while parsing arguments: {e}") + sys.exit(1) + except AttributeError as e: + logging.error(f"Required argument is missing: {e}") + sys.exit(1) + options, args = parser.parse_known_args(argv) + + settings.setenv(options.env) + github_token = settings.token + + # Updates the dynaconfig settings + try: + settings.update(vars(options)) + except ValidationError as e: + logging.error(f"One of the parameters provided is invalid: {e}") + sys.exit(1) + + queries = {} + with open(settings.gql_query_org, "r") as f: + queries["org"] = f.read() + with open(settings.gql_query_repo, "r") as f: + queries["repository"] = f.read() + + organization_name = settings.org + + session = requests.Session() + headers = { + "Authorization": f"Bearer {github_token}", + "Content-Type": "application/json", + } + session.headers.update(headers) + + inventory = [] + for repo in get_repositories(session, queries): + repo = clean_up(repo) + inventory.append(repo) + + session.close() + with open(f"inventory-{organization_name}.json", "w") as f: + json.dump(inventory, f, indent=4) + logging.info(f"Inventory saved to inventory-{organization_name}.json") + + +if __name__ == "__main__": + main(argv=None) diff --git a/scripts/inventory/github_inventory/pyproject.toml b/scripts/inventory/github_inventory/pyproject.toml new file mode 100644 index 0000000..47036f2 --- /dev/null +++ b/scripts/inventory/github_inventory/pyproject.toml @@ -0,0 +1,18 @@ +[tool.poetry] +name = "github-inventory" +version = "0.1.0" +description = "Scripts to pull and maintain a basic github inventory" +authors = ["Yassine Ilmi "] +license = "MIT" +readme = "README.md" +include = ["settings.toml", 
".env.example", "graphql_queries/*.graphql"] + +[tool.poetry.dependencies] +python = "^3.9" +dynaconf = "^3.2.5" +requests = "^2.32.3" +python-dotenv = "^1.0.1" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/scripts/inventory/github_inventory/settings.toml b/scripts/inventory/github_inventory/settings.toml new file mode 100644 index 0000000..3b6a540 --- /dev/null +++ b/scripts/inventory/github_inventory/settings.toml @@ -0,0 +1,10 @@ +# All these Settings can be set as environment variables, through a .env file, this settings file or command line ArgumentParser +# The GITHUB_INVENTORY_TOKEN has to be set as an environment variable +# Uncomment the settings you want to set here +[default] +# org = "" +# dotenv_path = "" +# issue = false +# pr = false +# gql_query_org = "" +# gql_query_repo = "" diff --git a/scripts/inventory/github_inventory/tests/__init__.py b/scripts/inventory/github_inventory/tests/__init__.py new file mode 100644 index 0000000..e69de29 From 8d686a105f2677012fec86a7f24ca513266ad831 Mon Sep 17 00:00:00 2001 From: Yassine Ilmi Date: Wed, 26 Jun 2024 21:06:59 +0100 Subject: [PATCH 2/3] SCM Inventory - GitHub Inventory IaC on AWS --- .../inventory/aws/scm-inventory/README.md | 146 +++++++++++++++++ .../inventory/aws/scm-inventory/iam.tf | 113 +++++++++++++ .../inventory/aws/scm-inventory/images.tf | 9 ++ .../inventory/aws/scm-inventory/locals.tf | 4 + .../inventory/aws/scm-inventory/main.tf | 50 ++++++ .../inventory/aws/scm-inventory/outputs.tf | 11 ++ .../inventory/aws/scm-inventory/providers.tf | 22 +++ .../inventory/aws/scm-inventory/s3.tf | 27 ++++ .../inventory/aws/scm-inventory/s3.tfbackend | 5 + .../inventory/aws/scm-inventory/secrets.tf | 3 + .../aws/scm-inventory/security-groups.tf | 21 +++ .../inventory/aws/scm-inventory/sts.tf | 1 + .../scm-inventory/terraform.tfvars.example | 22 +++ .../inventory/aws/scm-inventory/variables.tf | 149 ++++++++++++++++++ 
.../inventory/aws/scm-inventory/vpc.tf | 17 ++ 15 files changed, 600 insertions(+) create mode 100644 infrastructure/inventory/aws/scm-inventory/README.md create mode 100644 infrastructure/inventory/aws/scm-inventory/iam.tf create mode 100644 infrastructure/inventory/aws/scm-inventory/images.tf create mode 100644 infrastructure/inventory/aws/scm-inventory/locals.tf create mode 100644 infrastructure/inventory/aws/scm-inventory/main.tf create mode 100644 infrastructure/inventory/aws/scm-inventory/outputs.tf create mode 100644 infrastructure/inventory/aws/scm-inventory/providers.tf create mode 100644 infrastructure/inventory/aws/scm-inventory/s3.tf create mode 100644 infrastructure/inventory/aws/scm-inventory/s3.tfbackend create mode 100644 infrastructure/inventory/aws/scm-inventory/secrets.tf create mode 100644 infrastructure/inventory/aws/scm-inventory/security-groups.tf create mode 100644 infrastructure/inventory/aws/scm-inventory/sts.tf create mode 100644 infrastructure/inventory/aws/scm-inventory/terraform.tfvars.example create mode 100644 infrastructure/inventory/aws/scm-inventory/variables.tf create mode 100644 infrastructure/inventory/aws/scm-inventory/vpc.tf diff --git a/infrastructure/inventory/aws/scm-inventory/README.md b/infrastructure/inventory/aws/scm-inventory/README.md new file mode 100644 index 0000000..ce87953 --- /dev/null +++ b/infrastructure/inventory/aws/scm-inventory/README.md @@ -0,0 +1,146 @@ +# SCM Inventory Module + +The SCM Inventory module is designed to automate the deployment of resources necessary for scanning SCM and pulling an inventory from such platforms. Initially it supports pullung GitHub organizations' repositories, their issues and pull requests to generate an inventory and maintain it. + +The inventory includes by default additional information about the top 5 languages used in the repository as well as the top 5 topics used. This information can be customized to include additional data. 
+
+This Terraform module provisions an AWS EC2 instance, configures it with necessary permissions, and sets up a workflow to fetch GitHub inventory data and push it to an S3 bucket. The module is designed to be flexible and can be customized to support additional SCM platforms and data sources.
+
+## Supported SCM
+
+- GitHub: For more information see the python module [github_inventory](scripts/inventory/github_inventory/README.md) stored in this repository.
+
+## Prerequisites
+- AWS CLI configured with appropriate credentials
+- Access to an AWS account with permissions to create EC2 instances, IAM roles, policies, and S3 buckets
+- A GitHub token with permissions to access the repositories and organizations you wish to scan
+
+## Usage
+
+**Configure AWS Credentials**
+
+Ensure your AWS CLI is configured with credentials that have the necessary permissions to create the resources defined in this module.
+
+**Prepare GitHub Token**
+
+Store your GitHub token in AWS Secrets Manager. Note the ARN of the secret as it will be used in the Terraform variables.
+
+**Set Terraform Variables**
+
+Customize the Terraform variables defined in the variables.tf file or provide a terraform.tfvars file with your specific values.
+
+We recommend setting the variables in a terraform.tfvars file based on the [terraform.tfvars.example](infrastructure/inventory/aws/scm-inventory/terraform.tfvars.example) file provided.
+
+Key variables include:
+- aws_profile: The AWS profile to use for authentication.
+- aws_region: The AWS region where resources will be deployed.
+- s3_bucket_name: The name of the S3 bucket where the inventory will be stored. (This bucket must be created beforehand).
+- github_token_secret_name: The ARN of the AWS Secrets Manager secret containing your GitHub token. This will have to be provisioned separately.
+- project_name: A name for your project.
+- scanned_org: The GitHub organization you wish to scan.
+ +**Initialize Terraform** + +Run terraform init in the infrastructure/inventory/aws/scm-inventory/ directory to initialize the Terraform project. + +**Apply Terraform Configuration** + +Execute terraform apply to create the resources. Review the plan and confirm the action. + +**Access the Inventory** + +Once the EC2 instance completes its run, the generated inventory will be available in the specified S3 bucket. The instance can be configured to terminate automatically after completion. + +**Additional Notes** + +The EC2 instance will use a `t2.micro` instance type by default, but this can be adjusted based on your needs. We didn't want to use a larger instance type by default to avoid unnecessary costs. + +It is also possible to keep the EC2 running after the inventory generation, which can be useful for debugging purposes. This can be done by setting the `terminate_instance_after_completion` variable to `false`. + +The module supports optional fetching of issues and pull requests from the scanned GitHub organizations by setting the fetch_issues and fetch_pr variables. + +The inventory script is located in the `scripts/inventory/github_inventory` directory. + +For detailed information on the resources created and managed by this module, refer to the automatically generated documentation below. + + + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >=1.7 | +| [aws](#requirement\_aws) | ~> 5.0 | + +## Providers + +| Name | Version | +|------|---------| +| [aws](#provider\_aws) | ~> 5.0 | +| [local](#provider\_local) | n/a | +| [null](#provider\_null) | n/a | + +## Modules + +No modules. 
+ +## Resources + +| Name | Type | +|------|------| +| [aws_iam_instance_profile.ec2_instance_profile](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_instance_profile) | resource | +| [aws_iam_policy.permissions_for_ec2_instance](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | +| [aws_iam_policy.s3_access_policy](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | +| [aws_iam_role.ec2_role](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource | +| [aws_iam_role_policy_attachment.PermissionsForEC2InstancePolicyAttachment](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource | +| [aws_instance.ec2_inventory](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/instance) | resource | +| [aws_s3_object.poetry_dist](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/s3_object) | resource | +| [null_resource.poetry_build](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | +| [aws_ami.amazon_ami](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/ami) | data source | +| [aws_caller_identity.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/caller_identity) | data source | +| [aws_iam_policy_document.ec2_assume_role](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | +| [aws_iam_policy_document.policy_document_permissions_for_ec2_instance](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | +| [aws_iam_policy_document.s3_access_policy_document](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data 
source | +| [aws_s3_bucket.resources_and_results](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/s3_bucket) | data source | +| [aws_secretsmanager_secret.github_token_secret](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/secretsmanager_secret) | data source | +| [aws_security_group.default](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/security_group) | data source | +| [aws_security_groups.custom_security_groups](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/security_groups) | data source | +| [aws_subnet.selected](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/subnet) | data source | +| [aws_subnets.default](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/subnets) | data source | +| [aws_vpc.selected](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/vpc) | data source | +| [local_file.dist](https://registry.terraform.io/providers/hashicorp/local/latest/docs/data-sources/file) | data source | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [ami\_image\_filter](#input\_ami\_image\_filter) | Filter to use to find the Amazon Machine Image (AMI) to use for the EC2 instance the name can contain wildcards. Only GNU/Linux images are supported. 
| `string` | `"amzn2-ami-hvm*"` | no | +| [ami\_owner](#input\_ami\_owner) | Owner of the Amazon Machine Image (AMI) to use for the EC2 instance | `string` | `"amazon"` | no | +| [aws\_default\_security\_groups\_filters](#input\_aws\_default\_security\_groups\_filters) | Filters to use to find the default security groups | `list(string)` | `[]` | no | +| [aws\_profile](#input\_aws\_profile) | AWS profile to use for authentication | `string` | n/a | yes | +| [aws\_region](#input\_aws\_region) | AWS region where to deploy resources | `string` | `"us-east-1"` | no | +| [ec2\_workdir](#input\_ec2\_workdir) | Working directory for the EC2 instance | `string` | `"~/github-inventory"` | no | +| [environment\_type](#input\_environment\_type) | Environment (PRODUCTION, PRE-PRODUCTION, QUALITY ASSURANCE, INTEGRATION TESTING, DEVELOPMENT, LAB) | `string` | `"PRODUCTION"` | no | +| [fetch\_issues](#input\_fetch\_issues) | Indicates whether to fetch issues for the repositories | `bool` | `false` | no | +| [fetch\_pr](#input\_fetch\_pr) | Indicates whether to fetch pull requests for the repositories | `bool` | `false` | no | +| [github\_token\_secret\_name](#input\_github\_token\_secret\_name) | SSM parameter name containing the GitHub token of the Service Account | `string` | n/a | yes | +| [instance\_type](#input\_instance\_type) | Instance type to use for fetching the inventory | `string` | `"t2.micro"` | no | +| [inventory\_project\_dir](#input\_inventory\_project\_dir) | Path to the directory containing the inventory project | `string` | `"../../../../scripts/inventory/github_inventory"` | no | +| [permissions\_boundary\_arn](#input\_permissions\_boundary\_arn) | Permissions boundary to use for the IAM role | `string` | `null` | no | +| [project\_name](#input\_project\_name) | Name of the project | `string` | `"secrets-detection"` | no | +| [project\_version](#input\_project\_version) | Version of the project | `string` | `"0.1.0"` | no | +| 
[s3\_bucket\_name](#input\_s3\_bucket\_name) | S3 bucket name where to upload the scripts and results | `string` | n/a | yes | +| [scanned\_org](#input\_scanned\_org) | Name of the organization to scan | `string` | n/a | yes | +| [subnet\_name](#input\_subnet\_name) | Filter to select the subnet to use, this can use wildcards. | `string` | `null` | no | +| [tags](#input\_tags) | A map of tags to add to the resources | `map(string)` | `{}` | no | +| [terminate\_instance\_after\_completion](#input\_terminate\_instance\_after\_completion) | Indicates whether the instance should be terminated once the scan has finished (set to false for debugging purposes) | `bool` | `true` | no | +| [vpc\_name](#input\_vpc\_name) | Filter to select the VPC to use, this can use wildcards. | `string` | `""` | no | + +## Outputs + +| Name | Description | +|------|-------------| +| [ec2\_instance\_arn](#output\_ec2\_instance\_arn) | n/a | +| [ec2\_instance\_id](#output\_ec2\_instance\_id) | n/a | +| [ec2\_role\_arn](#output\_ec2\_role\_arn) | n/a | + diff --git a/infrastructure/inventory/aws/scm-inventory/iam.tf b/infrastructure/inventory/aws/scm-inventory/iam.tf new file mode 100644 index 0000000..d57eb9c --- /dev/null +++ b/infrastructure/inventory/aws/scm-inventory/iam.tf @@ -0,0 +1,113 @@ +## Role assumable by EC2 instance +data "aws_iam_policy_document" "ec2_assume_role" { + statement { + effect = "Allow" + principals { + identifiers = ["ec2.amazonaws.com"] + type = "Service" + } + actions = ["sts:AssumeRole"] + } +} + +resource "aws_iam_role" "ec2_role" { + name = "${var.project_name}-ec2-role" + assume_role_policy = data.aws_iam_policy_document.ec2_assume_role.json + path = "/" + permissions_boundary = var.permissions_boundary_arn +} + +resource "aws_iam_instance_profile" "ec2_instance_profile" { + name = "${var.project_name}-instance-profile" + role = aws_iam_role.ec2_role.name +} + +data "aws_iam_policy_document" "policy_document_permissions_for_ec2_instance" { + # S3: Get and 
put objects in S3 bucket + statement { + sid = "ListS3Bucket" + effect = "Allow" + actions = ["s3:ListBucket"] + resources = [data.aws_s3_bucket.resources_and_results.arn] + } + + statement { + sid = "GetAndPutObjectsInS3Bucket" + effect = "Allow" + actions = [ + "s3:GetObject*", + "s3:PutObject*" + ] + resources = ["${data.aws_s3_bucket.resources_and_results.arn}/*"] + } + + # Secrets Manager: Get GitHub API token + + statement { + sid = "FetchGitHubToken" + effect = "Allow" + actions = [ + "secretsmanager:GetSecretValue", + ] + resources = ["arn:aws:secretsmanager:${var.aws_region}:${data.aws_caller_identity.current.account_id}:secret:${var.project_name}/${var.github_token_secret_name}-*"] + } + + # EC2: Allow instance to schedule termination for itself (end of scan) + statement { + sid = "AllowTerminationOfEC2Instance" + effect = "Allow" + actions = [ + "ec2:TerminateInstances" + ] + resources = ["arn:aws:ec2:${var.aws_region}:${data.aws_caller_identity.current.account_id}:instance/*"] + + condition { + test = "StringLike" + variable = "aws:ResourceTag/Name" + values = ["${var.project_name}*"] + } + + condition { + test = "StringLike" + variable = "ec2:InstanceProfile" + values = [aws_iam_instance_profile.ec2_instance_profile.arn] + } + } +} + +resource "aws_iam_policy" "permissions_for_ec2_instance" { + name = "${var.project_name}-ec2-permissions" + description = "Policy granting necessary permissions to EC2 instance" + policy = data.aws_iam_policy_document.policy_document_permissions_for_ec2_instance.json +} + +resource "aws_iam_role_policy_attachment" "PermissionsForEC2InstancePolicyAttachment" { + policy_arn = aws_iam_policy.permissions_for_ec2_instance.arn + role = aws_iam_role.ec2_role.name +} + + +data "aws_iam_policy_document" "s3_access_policy_document" { + statement { + sid = "ListS3Bucket" + effect = "Allow" + actions = ["s3:ListBucket"] + resources = [data.aws_s3_bucket.resources_and_results.arn] + } + + statement { + sid = 
"GetAndListObjectsInS3Bucket" + effect = "Allow" + actions = [ + "s3:GetObject*", + "s3:ListObject*" + ] + resources = ["${data.aws_s3_bucket.resources_and_results.arn}/*"] + } +} + +resource "aws_iam_policy" "s3_access_policy" { + name = "${var.project_name}-s3-access" + description = "Policy allowing to access the S3 bucket used for Trufflehog" + policy = data.aws_iam_policy_document.s3_access_policy_document.json +} diff --git a/infrastructure/inventory/aws/scm-inventory/images.tf b/infrastructure/inventory/aws/scm-inventory/images.tf new file mode 100644 index 0000000..ae4a371 --- /dev/null +++ b/infrastructure/inventory/aws/scm-inventory/images.tf @@ -0,0 +1,9 @@ +data "aws_ami" "amazon_ami" { + most_recent = true + owners = [var.ami_owner] + + filter { + name = "name" + values = ["${var.ami_image_filter}"] + } +} diff --git a/infrastructure/inventory/aws/scm-inventory/locals.tf b/infrastructure/inventory/aws/scm-inventory/locals.tf new file mode 100644 index 0000000..1068bab --- /dev/null +++ b/infrastructure/inventory/aws/scm-inventory/locals.tf @@ -0,0 +1,4 @@ +locals { + environment = replace(lower(var.environment_type), " ", "-") + tags = var.tags +} diff --git a/infrastructure/inventory/aws/scm-inventory/main.tf b/infrastructure/inventory/aws/scm-inventory/main.tf new file mode 100644 index 0000000..a11674d --- /dev/null +++ b/infrastructure/inventory/aws/scm-inventory/main.tf @@ -0,0 +1,50 @@ + +resource "aws_instance" "ec2_inventory" { + ami = data.aws_ami.amazon_ami.id + instance_type = var.instance_type + subnet_id = data.aws_subnet.selected.id + iam_instance_profile = aws_iam_instance_profile.ec2_instance_profile.name + security_groups = length(var.aws_default_security_groups_filters) > 0 ? 
data.aws_security_groups.custom_security_groups[0].ids : [data.aws_security_group.default[0].id] + user_data_replace_on_change = true + metadata_options { + http_tokens = "required" + } + + root_block_device { + volume_size = 30 + volume_type = "gp2" + delete_on_termination = true + } + + user_data = join("\n", [ + "#!/bin/bash", + "aws configure set region ${var.aws_region}", + "mkdir -p ${var.ec2_workdir}/github_inventory-${var.project_version}", + "aws s3 cp s3://${data.aws_s3_bucket.resources_and_results.id}/${aws_s3_object.poetry_dist.key} ${var.ec2_workdir}/", + "export GITHUB_INVENTORY_TOKEN=$(aws secretsmanager get-secret-value --secret-id ${data.aws_secretsmanager_secret.github_token_secret.arn} --query SecretString --output text)", + "tar -xvf ${var.ec2_workdir}/github_inventory-${var.project_version}.tar.gz -C ${var.ec2_workdir}", + "cd ${var.ec2_workdir}/github_inventory-${var.project_version}", + "virtualenv local", + "source local/bin/activate", + "pip3 install poetry", + "poetry lock && poetry install", + var.fetch_pr ? "export GITHUB_INVENTORY_PR=True" : "", + var.fetch_issues ? "export GITHUB_INVENTORY_ISSUES=True" : "", + "poetry run python -m github_inventory --org ${var.scanned_org}", + "aws s3 cp ${var.ec2_workdir}/github_inventory-${var.project_version}/inventory-${var.scanned_org}.json s3://${data.aws_s3_bucket.resources_and_results.id}/outbound/json/inventory-${var.scanned_org}.json", + "TOKEN=$(curl -X PUT \"http://169.254.169.254/latest/api/token\" -H \"X-aws-ec2-metadata-token-ttl-seconds: 21600\")", + "export INSTANCE_ID=$(curl -H \"X-aws-ec2-metadata-token: $TOKEN\" -s http://169.254.169.254/latest/meta-data/instance-id)", + var.terminate_instance_after_completion ? 
"aws ec2 terminate-instances --instance-ids $INSTANCE_ID" : "" + ]) + + + tags = merge(var.tags, { Name = "${var.project_name}-ec2-${var.scanned_org}" }) + + depends_on = [ + data.local_file.dist, + null_resource.poetry_build, + aws_s3_object.poetry_dist, + aws_iam_policy.permissions_for_ec2_instance, + aws_iam_role_policy_attachment.PermissionsForEC2InstancePolicyAttachment, + ] +} diff --git a/infrastructure/inventory/aws/scm-inventory/outputs.tf b/infrastructure/inventory/aws/scm-inventory/outputs.tf new file mode 100644 index 0000000..a5c49f5 --- /dev/null +++ b/infrastructure/inventory/aws/scm-inventory/outputs.tf @@ -0,0 +1,11 @@ +output "ec2_role_arn" { + value = aws_iam_role.ec2_role.arn +} + +output "ec2_instance_id" { + value = aws_instance.ec2_inventory.id +} + +output "ec2_instance_arn" { + value = aws_instance.ec2_inventory.arn +} diff --git a/infrastructure/inventory/aws/scm-inventory/providers.tf b/infrastructure/inventory/aws/scm-inventory/providers.tf new file mode 100644 index 0000000..8347460 --- /dev/null +++ b/infrastructure/inventory/aws/scm-inventory/providers.tf @@ -0,0 +1,22 @@ +terraform { + required_version = ">=1.7" + + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 5.0" + } + } + + backend "s3" { + encrypt = true + } +} + +provider "aws" { + region = var.aws_region + profile = var.aws_profile + default_tags { + tags = local.tags + } +} diff --git a/infrastructure/inventory/aws/scm-inventory/s3.tf b/infrastructure/inventory/aws/scm-inventory/s3.tf new file mode 100644 index 0000000..117a0e4 --- /dev/null +++ b/infrastructure/inventory/aws/scm-inventory/s3.tf @@ -0,0 +1,27 @@ +data "aws_s3_bucket" "resources_and_results" { + bucket = var.s3_bucket_name +} + +resource "null_resource" "poetry_build" { + provisioner "local-exec" { + command = "poetry build -f sdist" + working_dir = "${var.inventory_project_dir}/" + } + + triggers = { + always_run = timestamp() + } +} + +data "local_file" "dist" { + filename = 
"${var.inventory_project_dir}/dist/github_inventory-${var.project_version}.tar.gz" + depends_on = [null_resource.poetry_build] +} + +resource "aws_s3_object" "poetry_dist" { + bucket = data.aws_s3_bucket.resources_and_results.id + key = "inventory/scripts/poetry_dist/github_inventory-${var.project_version}.tar.gz" + source = "${var.inventory_project_dir}/dist/github_inventory-${var.project_version}.tar.gz" + source_hash = data.local_file.dist.content_sha256 + depends_on = [data.local_file.dist] +} diff --git a/infrastructure/inventory/aws/scm-inventory/s3.tfbackend b/infrastructure/inventory/aws/scm-inventory/s3.tfbackend new file mode 100644 index 0000000..6fa4016 --- /dev/null +++ b/infrastructure/inventory/aws/scm-inventory/s3.tfbackend @@ -0,0 +1,5 @@ +bucket = "" +key = "" +region = "" +dynamodb_table = "" +profile = "" diff --git a/infrastructure/inventory/aws/scm-inventory/secrets.tf b/infrastructure/inventory/aws/scm-inventory/secrets.tf new file mode 100644 index 0000000..ae4a8be --- /dev/null +++ b/infrastructure/inventory/aws/scm-inventory/secrets.tf @@ -0,0 +1,3 @@ +data "aws_secretsmanager_secret" "github_token_secret" { + name = var.github_token_secret_name +} diff --git a/infrastructure/inventory/aws/scm-inventory/security-groups.tf b/infrastructure/inventory/aws/scm-inventory/security-groups.tf new file mode 100644 index 0000000..ad9bf79 --- /dev/null +++ b/infrastructure/inventory/aws/scm-inventory/security-groups.tf @@ -0,0 +1,21 @@ +data "aws_security_groups" "custom_security_groups" { + count = length(var.aws_default_security_groups_filters) > 0 ? 1 : 0 + + filter { + name = "group-name" + values = var.aws_default_security_groups_filters + } + + filter { + name = "vpc-id" + values = [data.aws_vpc.selected.id] + } +} + +# Data source for the default security group, always fetched but conditionally used +data "aws_security_group" "default" { + count = length(var.aws_default_security_groups_filters) > 0 ? 
0 : 1 + + vpc_id = data.aws_vpc.selected.id + name = "default" +} diff --git a/infrastructure/inventory/aws/scm-inventory/sts.tf b/infrastructure/inventory/aws/scm-inventory/sts.tf new file mode 100644 index 0000000..8fc4b38 --- /dev/null +++ b/infrastructure/inventory/aws/scm-inventory/sts.tf @@ -0,0 +1 @@ +data "aws_caller_identity" "current" {} diff --git a/infrastructure/inventory/aws/scm-inventory/terraform.tfvars.example b/infrastructure/inventory/aws/scm-inventory/terraform.tfvars.example new file mode 100644 index 0000000..d3e6502 --- /dev/null +++ b/infrastructure/inventory/aws/scm-inventory/terraform.tfvars.example @@ -0,0 +1,22 @@ +aws_region = "" +aws_profile = "" +environment_type = "" +project_name = "" +github_token_secret_name = "" +permissions_boundary_arn = "" +s3_bucket_name = "" +scanned_org = "" +vpc_name = "vpc*" +subnet_name = "subnet_name*" +tags = { + "mytag" = "tag" + "mytag2" = "tag2" +} +instance_type = "" +ami_owner = "" +ami_image_filter = "" +aws_default_security_groups_filters = ["Security-Group-1*", "Security-Group-2*"] +terminate_instance_after_completion = true +s3_force_delete = false +fetch_issues = false +fetch_pr = false diff --git a/infrastructure/inventory/aws/scm-inventory/variables.tf b/infrastructure/inventory/aws/scm-inventory/variables.tf new file mode 100644 index 0000000..691e074 --- /dev/null +++ b/infrastructure/inventory/aws/scm-inventory/variables.tf @@ -0,0 +1,149 @@ +variable "aws_region" { + type = string + default = "us-east-1" + description = "AWS region where to deploy resources" + + validation { + condition = can(regex("^(af|ap|ca|eu|me|sa|us)-(central|north|(north(?:east|west))|south|south(?:east|west)|east|west)-\\d+$", var.aws_region)) + error_message = "You should enter a valid AWS region (https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/Concepts.RegionsAndAvailabilityZones.html)" + } +} + +variable "aws_profile" { + type = string + description = "AWS profile to use for authentication" +} + 
+variable "project_name" { + type = string + description = "Name of the project" + default = "secrets-detection" +} + +variable "environment_type" { + type = string + default = "PRODUCTION" + description = "Environment (PRODUCTION, PRE-PRODUCTION, QUALITY ASSURANCE, INTEGRATION TESTING, DEVELOPMENT, LAB)" + + validation { + condition = contains(["PRODUCTION", "PRE-PRODUCTION", "QUALITY ASSURANCE", "INTEGRATION TESTING", "DEVELOPMENT", "LAB"], var.environment_type) + error_message = "The environment type should be one of the following values: PRODUCTION, PRE-PRODUCTION, QUALITY ASSURANCE, INTEGRATION TESTING, DEVELOPMENT, LAB (case sensitive)" + } +} + +variable "vpc_name" { + type = string + default = "" + description = "Filter to select the VPC to use, this can use wildcards." +} + +variable "subnet_name" { + type = string + default = null + description = "Filter to select the subnet to use, this can use wildcards." +} + +variable "s3_bucket_name" { + type = string + description = "S3 bucket name where to upload the scripts and results" + + validation { + condition = can(regex("^[a-z0-9.-]{3,63}$", var.s3_bucket_name)) + error_message = "The S3 bucket name must be a valid string with only a-z0-9.- characters and have a length between 3 and 63" + } +} + +variable "github_token_secret_name" { + type = string + description = "SSM parameter name containing the GitHub token of the Service Account" +} + + +variable "scanned_org" { + type = string + description = "Name of the organization to scan" +} + +variable "terminate_instance_after_completion" { + type = bool + default = true + description = "Indicates whether the instance should be terminated once the scan has finished (set to false for debugging purposes)" +} + +variable "instance_type" { + type = string + default = "t2.micro" + description = "Instance type to use for fetching the inventory" +} + +variable "tags" { + type = map(string) + description = "A map of tags to add to the resources" + default = {} + + 
validation { + condition = alltrue([for v in values(var.tags) : v != ""]) + error_message = "Tag values must not be empty." + } +} + +variable "ami_owner" { + type = string + default = "amazon" + description = "Owner of the Amazon Machine Image (AMI) to use for the EC2 instance" +} + + +variable "ami_image_filter" { + type = string + default = "amzn2-ami-hvm*" + description = "Filter to use to find the Amazon Machine Image (AMI) to use for the EC2 instance the name can contain wildcards. Only GNU/Linux images are supported." + +} + + +variable "permissions_boundary_arn" { + type = string + default = null + description = "Permissions boundary to use for the IAM role" +} + + +variable "ec2_workdir" { + type = string + default = "~/github-inventory" + description = "Working directory for the EC2 instance" +} + +variable "aws_default_security_groups_filters" { + type = list(string) + default = [] + description = "Filters to use to find the default security groups" +} + + +variable "project_version" { + type = string + default = "0.1.0" + description = "Version of the project" +} + +variable "inventory_project_dir" { + type = string + default = "../../../../scripts/inventory/github_inventory" + description = "Path to the directory containing the inventory project" + +} + +variable "fetch_pr" { + type = bool + default = false + description = "Indicates whether to fetch pull requests for the repositories" +} + +variable "fetch_issues" { + type = bool + default = false + description = "Indicates whether to fetch issues for the repositories" + +} diff --git a/infrastructure/inventory/aws/scm-inventory/vpc.tf b/infrastructure/inventory/aws/scm-inventory/vpc.tf new file mode 100644 index 0000000..cf2578a --- /dev/null +++ b/infrastructure/inventory/aws/scm-inventory/vpc.tf @@ -0,0 +1,17 @@ +data "aws_vpc" "selected" { + filter { + name = "tag:Name" + values = [var.vpc_name] + } +} + +data "aws_subnets" "default" { + filter { + name = "tag:Name" + values = [var.subnet_name] + 
}
+}
+
+data "aws_subnet" "selected" {
+  id = element(sort(data.aws_subnets.default.ids), 0)
+}

From 8526401081b9fce61f67bee1660b6bb239aad220 Mon Sep 17 00:00:00 2001
From: Yassine Ilmi
Date: Thu, 27 Jun 2024 00:10:39 +0100
Subject: [PATCH 3/3] Updating main readme.md

---
 README.md | 46 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 45 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 82527d9..2f73356 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,50 @@
 # Secrets Finder
 
 ![Contributor Covenant Badge 2.1](https://img.shields.io/badge/Contributor%20Covenant-2.1-4baaaa.svg)
 
+Welcome to the Secrets Finder project! This repository contains tools and infrastructure to support organizations in rolling out their own secrets detection and prevention programs, focusing on scanning source code repositories. Our solution leverages various open-source tools and cloud services to provide automated, scheduled, and event-based scanning capabilities.
 
-This repository contains the code for the secrets-finder, an initiative that aims to provide to organizations and users scheduled and automated secrets scanning capabilities in source code repositories.
+## Overview
+
+Secrets Finder is designed to help organizations manage and detect secrets in their codebases. The project includes modules for both ongoing and scheduled scans, utilizing tools like [TruffleHog](https://github.com/trufflesecurity/trufflehog) and cloud providers such as AWS, with some features integrated with GitHub.
+
+> **Note**: This project is a work in progress and is production-ready for the currently supported technologies. We are actively working on adding support for more integrations with cloud providers and source code management (SCM) systems. We welcome contributions and feedback from the community.
+
+Some of the tools can be used directly from a workstation, while others require cloud infrastructure to be set up. 
The project includes Terraform scripts to automate the deployment of the necessary infrastructure.
+
+### Key Features:
+- **Secrets Management and Storage**: Manages secrets using AWS Secrets Manager and S3 for secure storage and access. For more details, see [Secrets Management README](infrastructure/secrets-finder/setup/aws/secrets/README.md) and [Storage README](infrastructure/secrets-finder/setup/aws/storage/README.md).
+- **Database Migrations**: Manages database migrations using Alembic for SQLite, Postgres, MariaDB, and others. This component handles database schema updates, allowing for safe schema extensions. For more details, see [Migrations README](migrations/README.md).
+- **Ingestion Infrastructure**: Sets up infrastructure for data ingestion using AWS services to ingest data from various sources such as scans, jobs, findings, inventory, and issues. For more details, see [Ingestion Infrastructure README](infrastructure/ingestion/aws/README.md).
+- **Ongoing Scans**: Provides infrastructure for ongoing scans of GitHub repositories. This uses various components such as a GitHub App, an API Gateway, AWS Lambdas, and CloudFront. This type of scan monitors events in your GitHub repositories and, in the event of a secret detection, will comment on the pull request, or create an issue for pushes to the default branch. For public repositories, the visibility of the repository could also be changed automatically. For more details, see [Ongoing Scans README](infrastructure/secrets-finder/ongoing-scans/README.md).
+- **Scheduled Scans**: Provides infrastructure for scheduled scans of git-based repositories, supporting multiple SCM platforms. This helps you scan your repositories regularly and ingests the findings, allowing you to establish the baseline for your program. For more details, see [Scheduled Scans README](infrastructure/secrets-finder/scheduled-scans/aws/README.md).
+- **Automated SCM Inventory**: Supports the deployment of resources to fetch your SCM inventory, which includes various metadata for a repository as well as issues, pull requests, languages and topics. This lays the ground for, e.g., more efficient scheduled scanning by supporting incremental scans that only cover changed repositories. For more details, see [SCM Inventory README](infrastructure/inventory/aws/scm-inventory/README.md) and [GitHub Inventory README](scripts/inventory/github_inventory/README.md).
+
+## Getting Started
+
+### Prerequisites
+- Access to an AWS principal with permissions to create necessary resources (see individual modules for details)
+- SCM token with required permissions for accessing repositories during scans
+
+### Usage
+
+While a README file is provided for each module with more detailed instructions on how to use each module, here are some of the general steps to get started:
+
+1. **Configure AWS Credentials**: Ensure your AWS CLI is configured with credentials that have the necessary permissions.
+2. **Prepare your SCM tokens**: Either store the SCM secrets (GitHub or other SCM tokens) directly in AWS Secrets Manager, or use the secrets module to manage and expose them to the various automation pieces.
+3. **Create a Terraform State S3 Bucket**: Create an S3 bucket to store the various modules' state files and update the `s3.tfbackend` files in each module.
+4. **Set Terraform Variables**: Provide a `terraform.tfvars` file setting the required variables or customizing some of the default values provided.
+5. **Deploy**: Run `terraform init` with the appropriate backend configuration, then `terraform apply` in each module you want to deploy.
+
+
+## Contributing
+
+We welcome contributions! Please see our [Contributing Guidelines](docs/CONTRIBUTING.md) for more information on how to get involved.
+
+## License
+
+This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+
+## Support
+
+If you have any questions or need help, please feel free to open an issue or contact the maintainers.