
Commit

configuration, build, and deployment scripts
Ram Mohan committed Jul 8, 2019
1 parent 34c438e commit 14568cd
Showing 9 changed files with 687 additions and 0 deletions.
15 changes: 15 additions & 0 deletions appspec.yml
@@ -0,0 +1,15 @@
version: 0.0
os: linux
files:
- source: collectorApp
destination: collectorApp
hooks:
BeforeInstall:
- location: collectorApp/scripts/kill_previous_one.sh
timeout: 40
ApplicationStart:
- location: collectorApp/scripts/start_collector.sh
timeout: 40
ValidateService:
- location: collectorApp/scripts/health_check.sh
timeout: 70
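
The three hook scripts referenced above (kill_previous_one.sh, start_collector.sh, health_check.sh) live under collectorApp/scripts/ and are not among the files shown here. A minimal sketch of what health_check.sh could look like, assuming the collector listens on port 5000 (as in the configs below) and exposes the standard Snowplow collector /health endpoint; this is an illustration, not the committed script:

#!/bin/bash
# Hypothetical sketch of collectorApp/scripts/health_check.sh (assumption, not the committed file).
# Poll the collector's /health endpoint until it answers, staying inside the
# 70-second ValidateService timeout declared in appspec.yml (12 x 5s = 60s).
for attempt in $(seq 1 12); do
  if curl -sf http://localhost:5000/health > /dev/null; then
    echo "collector is healthy"
    exit 0
  fi
  sleep 5
done
echo "collector failed health check" >&2
exit 1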
52 changes: 52 additions & 0 deletions build.sh
@@ -0,0 +1,52 @@
#!/bin/bash


WORKDIR="$(dirname "$0")"
JAR_FILE="2-collectors/scala-stream-collector/kinesis/target/scala-2.11/snowplow-stream-collector-kinesis-0.15.0.jar"

cd "$WORKDIR"
mkdir -p target/common

# TODO: include ivy-cache

# change into scala-stream-collector directory
cd 2-collectors/scala-stream-collector

# build jar
if [ -z "$1" ]
then
sbt "project kinesis" assembly
else
sbt -Dsbt.ivy.home="$1" "project kinesis" assembly
fi

cd ../..
cp "$JAR_FILE" target/common/snowplow-stream-collector.jar
echo "jar copied to target/common"

cp -r scripts target/common/

mkdir -p target/us/collectorApp/config
mkdir -p target/sit/collectorApp/config

cp -R target/common/* target/us/collectorApp/
# Write the sed output to a temp file so this works on both macOS and Linux (BSD and GNU sed handle in-place edits differently)
sed -e 's/collector-THISWILLCHANGE-stdout.log/collector-usprod-stdout.log/g' target/us/collectorApp/scripts/start_collector.sh >tmp_1.sh
mv tmp_1.sh target/us/collectorApp/scripts/start_collector.sh
cp config/collector-us.conf target/us/collectorApp/config/collector.conf
cp appspec.yml target/us/
cp deploy_aws_code_deploy_revision.sh target/us/
cd target/us
tar -cvf ../collector-us.zip *
cd ../..

cp -R target/common/* target/sit/collectorApp/
# Write the sed output to a temp file so this works on both macOS and Linux (BSD and GNU sed handle in-place edits differently)
sed -e 's/collector-THISWILLCHANGE-stdout.log/collector-ausit-stdout.log/g' target/sit/collectorApp/scripts/start_collector.sh >tmp_1.sh
mv tmp_1.sh target/sit/collectorApp/scripts/start_collector.sh
cp config/collector-sit.conf target/sit/collectorApp/config/collector.conf
cp appspec.yml target/sit/
cp deploy_aws_code_deploy_revision.sh target/sit/
cd target/sit
tar -cvf ../collector-sit.zip *
cd ../..
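
Typical invocations of build.sh, assuming the Snowplow sources are checked out under 2-collectors/ as the script expects (the ivy cache path below is only an example):

# Build with the default ivy cache
./build.sh

# Build with a shared ivy cache; the path is passed through as -Dsbt.ivy.home
./build.sh /opt/ivy-cache

# Each run produces one CodeDeploy revision archive per environment
ls target/collector-us.zip target/collector-sit.zip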
163 changes: 163 additions & 0 deletions config/collector-sit.conf
@@ -0,0 +1,163 @@
# Copyright (c) 2013-2017 Snowplow Analytics Ltd. All rights reserved.
#
# This program is licensed to you under the Apache License Version 2.0, and
# you may not use this file except in compliance with the Apache License
# Version 2.0. You may obtain a copy of the Apache License Version 2.0 at
# http://www.apache.org/licenses/LICENSE-2.0.
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the Apache License Version 2.0 is distributed on an "AS
# IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the Apache License Version 2.0 for the specific language
# governing permissions and limitations there under.

# This file is based on the Scala Stream Collector configuration template
# (application.conf.example), adapted here for the SIT environment.

# 'collector' contains configuration options for the main Scala collector.
collector {
# The collector runs as a web service specified on the following interface and port.
interface = "0.0.0.0"
port = 5000

# Configure the P3P policy header.
p3p {
policyRef = "/w3c/p3p.xml"
CP = "NOI DSP COR NID PSA OUR IND COM NAV STA"
}

crossDomain {
enabled = false
domain = "*"
secure = true
}

# The collector returns a cookie to clients for user identification
# with the following domain and expiration.
cookie {
enabled = true
expiration = "365 days"
# Network cookie name
name = sp
# The domain is optional and will make the cookie accessible to other
# applications on the domain. Comment out this line to tie cookies to
# the collector's full domain
domain = ".newscgp.com"
}

# When enabled and the cookie specified above is missing, performs a redirect to itself to check
# if third-party cookies are blocked using the specified name. If they are indeed blocked,
# fallbackNetworkUserId is used instead of generating a new random one.
cookieBounce {
enabled = false
# The name of the request parameter which will be used on redirects checking that third-party
# cookies work.
name = "n3pc"
# Network user id to fallback to when third-party cookies are blocked.
fallbackNetworkUserId = "00000000-0000-4000-A000-000000000000"
# Optionally, specify the name of the header containing the originating protocol for use in the
# bounce redirect location. Use this if behind a load balancer that performs SSL termination.
# The value of this header must be http or https. Example, if behind an AWS Classic ELB.
forwardedProtocolHeader = "X-Forwarded-Proto"
}

# When enabled, the redirect url passed via the `u` query parameter is scanned for a placeholder
# token. All instances of that token are replaced with the network user ID. If the placeholder isn't
# specified, the default value is `${SP_NUID}`.
redirectMacro {
enabled = false
# Optional custom placeholder token (defaults to the literal `${SP_NUID}`)
placeholder = "[TOKEN]"
}

streams {
# Events which have successfully been collected will be stored in the good stream/topic
good = "ncg-sit-raw-good"

# Events that are too big (w.r.t Kinesis 1MB limit) will be stored in the bad stream/topic
bad = "ncg-sit-raw-bad"

# Whether to use the incoming event's ip as the partition key for the good stream/topic
# Note: NSQ does not make use of the partition key.
useIpAddressAsPartitionKey = false

# Enable the chosen sink by uncommenting the appropriate configuration
sink {
# Choose between kinesis, kafka, nsq, or stdout.
# To use stdout, comment or remove everything in the "collector.streams.sink" section except
# "enabled" which should be set to "stdout".
enabled = kinesis

# Region where the streams are located
region = "ap-southeast-2"

# Thread pool size for Kinesis API requests
threadPoolSize = 30

# The following are used to authenticate for the Amazon Kinesis sink.
# If both are set to 'default', the default provider chain is used
# (see http://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/auth/DefaultAWSCredentialsProviderChain.html)
# If both are set to 'iam', use AWS IAM Roles to provision credentials.
# If both are set to 'env', use environment variables AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY
aws {
accessKey = iam
secretKey = iam
}

# Minimum and maximum backoff periods
backoffPolicy {
minBackoff = 3000 # 3 seconds
maxBackoff = 600000 # 10 minutes
}

# Or Kafka
#brokers = "{{kafkaBrokers}}"
## Number of retries to perform before giving up on sending a record
#retries = 0

# Or NSQ
## Host name for nsqd
#host = "{{nsqHost}}"
## TCP port for nsqd, 4150 by default
#port = {{nsqdPort}}
}

# Incoming events are stored in a buffer before being sent to Kinesis/Kafka.
# Note: Buffering is not supported by NSQ.
# The buffer is emptied whenever:
# - the number of stored records reaches record-limit or
# - the combined size of the stored records reaches byte-limit or
# - the time in milliseconds since the buffer was last emptied reaches time-limit
buffer {
byteLimit = 400000
recordLimit = 500
timeLimit = 5000
}
}
}

# Akka has a variety of possible configuration options defined at
# http://doc.akka.io/docs/akka/current/scala/general/configuration.html
akka {
loglevel = OFF # 'OFF' for no logging, 'DEBUG' for all logging.
loggers = ["akka.event.slf4j.Slf4jLogger"]

# akka-http is the server the Stream collector uses and has configurable options defined at
# http://doc.akka.io/docs/akka-http/current/scala/http/configuration.html
http.server {
# To obtain the hostname in the collector, the 'remote-address' header
# should be set. By default, this is disabled, and enabling it
# adds the 'Remote-Address' header to every request automatically.
remote-address-header = on

raw-request-uri-header = on

# Define the maximum URI length (the akka-http default is 2048)
parsing {
max-uri-length = 32768
uri-parsing-mode = relaxed
}
}
}
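
With this SIT configuration the collector listens on 0.0.0.0:5000 and writes raw events to the ncg-sit-raw-good / ncg-sit-raw-bad Kinesis streams in ap-southeast-2. A quick smoke test against a deployed instance might look like the following (localhost is a stand-in for the real host; /health and /i are the collector's standard health-check and GET pixel endpoints):

# Health check, the same endpoint a ValidateService hook would poll
curl -i http://localhost:5000/health

# Send a test pageview; the raw record should land on ncg-sit-raw-good
curl -i "http://localhost:5000/i?e=pv&page=smoke-test"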
145 changes: 145 additions & 0 deletions config/collector-stdout.conf
@@ -0,0 +1,145 @@
# This conf is intended for the local dev setup; the collector output goes to stdout.
collector {
# The collector runs as a web service specified on the following interface and port.
interface = "0.0.0.0"
port = 5000

# Configure the P3P policy header.
p3p {
policyRef = "/w3c/p3p.xml"
CP = "NOI DSP COR NID PSA OUR IND COM NAV STA"
}

crossDomain {
enabled = false
domain = "*"
secure = true
}

# The collector returns a cookie to clients for user identification
# with the following domain and expiration.
cookie {
enabled = true
expiration = "365 days"
# Network cookie name
name = sp
# The domain is optional and will make the cookie accessible to other
# applications on the domain. Comment out this line to tie cookies to
# the collector's full domain
domain = ".newscgp.com"
}

# When enabled and the cookie specified above is missing, performs a redirect to itself to check
# if third-party cookies are blocked using the specified name. If they are indeed blocked,
# fallbackNetworkUserId is used instead of generating a new random one.
cookieBounce {
enabled = false
# The name of the request parameter which will be used on redirects checking that third-party
# cookies work.
name = "n3pc"
# Network user id to fallback to when third-party cookies are blocked.
fallbackNetworkUserId = "00000000-0000-4000-A000-000000000000"
# Optionally, specify the name of the header containing the originating protocol for use in the
# bounce redirect location. Use this if behind a load balancer that performs SSL termination.
# The value of this header must be http or https. Example, if behind an AWS Classic ELB.
forwardedProtocolHeader = "X-Forwarded-Proto"
}

# When enabled, the redirect url passed via the `u` query parameter is scanned for a placeholder
# token. All instances of that token are replaced with the network user ID. If the placeholder isn't
# specified, the default value is `${SP_NUID}`.
redirectMacro {
enabled = false
# Optional custom placeholder token (defaults to the literal `${SP_NUID}`)
placeholder = "[TOKEN]"
}

streams {
# Events which have successfully been collected will be stored in the good stream/topic
good = "ncg-uat-raw-good"

# Events that are too big (w.r.t Kinesis 1MB limit) will be stored in the bad stream/topic
bad = "ncg-uat-raw-bad"

# Whether to use the incoming event's ip as the partition key for the good stream/topic
# Note: NSQ does not make use of the partition key.
useIpAddressAsPartitionKey = false

# Enable the chosen sink by uncommenting the appropriate configuration
sink {
# Choose between kinesis, kafka, nsq, or stdout.
# To use stdout, comment or remove everything in the "collector.streams.sink" section except
# "enabled" which should be set to "stdout".
enabled = "stdout"

# Region where the streams are located
# region = "ap-southeast-2"

# Thread pool size for Kinesis API requests
# threadPoolSize = 30

# The following are used to authenticate for the Amazon Kinesis sink.
# If both are set to 'default', the default provider chain is used
# (see http://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/auth/DefaultAWSCredentialsProviderChain.html)
# If both are set to 'iam', use AWS IAM Roles to provision credentials.
# If both are set to 'env', use environment variables AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY
# aws {
# accessKey = iam
# secretKey = iam
# }

# Minimum and maximum backoff periods
# backoffPolicy {
# minBackoff = 3000 # 3 seconds
# maxBackoff = 600000 # 10 minutes
# }

# Or Kafka
#brokers = "{{kafkaBrokers}}"
## Number of retries to perform before giving up on sending a record
#retries = 0

# Or NSQ
## Host name for nsqd
#host = "{{nsqHost}}"
## TCP port for nsqd, 4150 by default
#port = {{nsqdPort}}
}

# Incoming events are stored in a buffer before being sent to Kinesis/Kafka.
# Note: Buffering is not supported by NSQ.
# The buffer is emptied whenever:
# - the number of stored records reaches record-limit or
# - the combined size of the stored records reaches byte-limit or
# - the time in milliseconds since the buffer was last emptied reaches time-limit
buffer {
byteLimit = 400000
recordLimit = 500
timeLimit = 5000
}
}
}

# Akka has a variety of possible configuration options defined at
# http://doc.akka.io/docs/akka/current/scala/general/configuration.html
akka {
loglevel = OFF # 'OFF' for no logging, 'DEBUG' for all logging.
loggers = ["akka.event.slf4j.Slf4jLogger"]

# akka-http is the server the Stream collector uses and has configurable options defined at
# http://doc.akka.io/docs/akka-http/current/scala/http/configuration.html
http.server {
# To obtain the hostname in the collector, the 'remote-address' header
# should be set. By default, this is disabled, and enabling it
# adds the 'Remote-Address' header to every request automatically.
remote-address-header = on

raw-request-uri-header = on

# Define the maximum URI length (the akka-http default is 2048)
parsing {
max-uri-length = 32768
uri-parsing-mode = relaxed
}
}
}
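
For local development with this stdout sink, the collector can be run straight from the assembled jar; a sketch, assuming the jar and config paths produced by build.sh above:

# Run the collector locally; collected events are printed to stdout instead of Kinesis
java -jar target/common/snowplow-stream-collector.jar --config config/collector-stdout.conf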
