Commit 35e781c

Add support to pull datapoints from Kafka
This change introduces a separate thread that pulls data from a Kafka topic and inserts datapoints into the shared queue that separates ingestion from processing. The idea is to have Druid daemons that need to push hundreds of datapoints/s emit them to Kafka (via the KafkaEmitter), and to collect them at a slower pace in the exporter.

GH issue: #11
Change-Id: Ibc82be5883f20c26b50342d2032381086bcd218a
1 parent 6f03fcd commit 35e781c
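For context, the three new command-line options added in druid_exporter/exporter.py below are how the exporter is pointed at the topic used by the KafkaEmitter. A rough invocation sketch (topic, broker and consumer group names are placeholders, and the pre-existing options such as the listen address and the metrics config are omitted):

    druid_exporter \
        --kafka-topic druid-metrics \
        --kafka-bootstrap-servers kafka1001:9092 kafka1002:9092 \
        --kafka-consumer-group-id druid_exporter

The three options must be provided together: a partial Kafka configuration makes the exporter exit with an argument error, and omitting all of them keeps the previous HTTP-only behaviour.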

4 files changed, +90 -9 lines changed


README.md (+16 -6)

@@ -57,6 +57,14 @@ The druid prometheus exporter accepts HTTP POST data, inspects it and stores/agg
 every supported datapoint into a data structure. It then formats the
 data on the fly to Prometheus metrics when a GET /metrics is requested.
 
+The exporter is also able to pull metrics from a Kafka topic, when Druid is configured to do so:
+
+https://druid.apache.org/docs/latest/development/extensions-contrib/kafka-emitter.html
+
+The Kafka support is optional and should be used when the number of metrics/datapoints emitted
+by Druid is very high (a rule of thumb could be around 1000 datapoints/s). Please check the section
+about performance considerations below for more info.
+
 This exporter is supposed to be run on each host running one or multiple Druid daemons.
 
 ## Supported metrics and labels
@@ -117,14 +125,16 @@ requests are welcome!
 
 In https://github.com/wikimedia/operations-software-druid_exporter/issues/11 some users
 brought up an interesting use case for the exporter, namely handling peaks of 1500 datapoints/s
-sent from Druid brokers. The exporter's code was refactored to be able to scale more
+sent from Druid brokers (one can check the rate of datapoints/s via the
+`druid_exporter_datapoints_registered_total` metric).
+
+The exporter's code was refactored to be able to scale more
 (if you are curious, check [this commit](https://github.com/wikimedia/operations-software-druid_exporter/commit/f22c6d9f8707ae2d274db9b10669b971beed64ab)), but it wasn't enough, since users kept reporting timeouts from Druid daemons while sending
 datapoints to the exporter. There are some recommendations to follow:
 * Try to use a dedicated exporter instance for daemons sending high volumes of datapoints.
 * Try to tune the HTTP emitter's settings via https://druid.apache.org/docs/latest/configuration/index.html#http-emitter-module
 (if the Druid version that you are running supports them).
-On our side, we are working on a solution that should be simple and flexible, namely use the
-[https://druid.apache.org/docs/latest/development/extensions-contrib/kafka-emitter.html](Kafka)
-emitter instead of the HTTP emitter. The idea is to instruct Druid daemons to push datapoints
-to a Kafka topic, and then to point the exporter to it. The code is not ready yet,
-please refer to the aforementioned issue for updates.
+If the above is not enough, the exporter can be configured to also pull datapoints from
+a Kafka topic (see [Kafka](https://druid.apache.org/docs/latest/development/extensions-contrib/kafka-emitter.html)). With this configuration, the exporter will ingest datapoints coming via
+HTTP and Kafka at the same time. An ideal solution is to force Druid daemons emitting too many
+datapoints/s to use the KafkaEmitter, and the others to use the HTTPEmitter.
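To complement the added README paragraph, this is a minimal sketch of the Druid side, assuming the property names documented for the kafka-emitter extension (double check them against the linked page for the Druid version in use; topic and broker values are placeholders):

    # runtime.properties of the daemons that should emit via Kafka
    druid.extensions.loadList=[..., "kafka-emitter"]
    druid.emitter=kafka
    druid.emitter.kafka.bootstrap.servers=kafka1001:9092,kafka1002:9092
    druid.emitter.kafka.metric.topic=druid-metrics

The exporter is then started with matching --kafka-topic and --kafka-bootstrap-servers values.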

druid_exporter/collector.py (+39 -1)

@@ -24,12 +24,17 @@
 
 log = logging.getLogger(__name__)
 
+try:
+    from kafka import KafkaConsumer
+except ImportError:
+    KafkaConsumer = None
+
 
 class DruidCollector(object):
     scrape_duration = Summary(
         'druid_scrape_duration_seconds', 'Druid scrape duration')
 
-    def __init__(self, metrics_config):
+    def __init__(self, metrics_config, kafka_config=None):
 
         # The ingestion of the datapoints is separated from their processing,
         # to separate concerns and avoid unnecessary slowdowns for Druid
@@ -41,6 +46,17 @@ def __init__(self, metrics_config):
         self.datapoints_queue = queue.Queue()
         threading.Thread(target=self.process_queued_datapoints).start()
 
+        # if a Kafka config is provided, create a dedicated thread
+        # that pulls datapoints from a Kafka topic.
+        # The thread will then push datapoints to the same queue that
+        # the HTTP server uses. In this way the exporter allows a mixed
+        # configuration for Druid Brokers between HTTPEmitter and
+        # KafkaEmitter (for daemons emitting too many datapoints/s).
+        if kafka_config and KafkaConsumer:
+            threading.Thread(
+                target=self.pull_datapoints_from_kafka,
+                args=(kafka_config,)).start()
+
         # Datapoints successfully registered
         self.datapoints_registered = 0
 
@@ -255,3 +271,25 @@ def process_queued_datapoints(self):
                 self.store_counter(datapoint)
 
             self.datapoints_registered += 1
+
+    def pull_datapoints_from_kafka(self, kafka_config):
+        consumer = KafkaConsumer(
+            kafka_config['topic'],
+            group_id=kafka_config['group_id'],
+            bootstrap_servers=kafka_config['bootstrap_servers'])
+
+        while True:
+            consumer.poll()
+            for message in consumer:
+                try:
+                    json_message = json.loads(message.value.decode())
+                    log.debug('Datapoint from kafka: %s', json_message)
+                    if type(json_message) == list:
+                        for datapoint in json_message:
+                            self.register_datapoint(datapoint)
+                    else:
+                        self.register_datapoint(json_message)
+                except json.JSONDecodeError:
+                    log.exception("Failed to decode message from Kafka, skipping..")
+                except Exception:
+                    log.exception("Generic exception while pulling datapoints from Kafka")

druid_exporter/exporter.py (+31 -1)

@@ -122,13 +122,42 @@ def main():
                         help='Enable debug logging')
     parser.add_argument('-e', '--encoding', default='utf-8',
                         help='Encoding of the Druid POST JSON data.')
+    kafka_parser = parser.add_argument_group('kafka',
+                                             'Optional configuration for datapoints emitted '
+                                             'to a topic via the Druid Kafka Emitter extension.')
+    kafka_parser.add_argument('-t', '--kafka-topic',
+                              help='Pull datapoints from a given Kafka topic.')
+    kafka_parser.add_argument('-b', '--kafka-bootstrap-servers', nargs='+',
+                              help='Pull datapoints from a given list of Kafka brokers.')
+    kafka_parser.add_argument('-g', '--kafka-consumer-group-id',
+                              help='Pull datapoints from Kafka using this Consumer group id.')
+
     args = parser.parse_args()
 
     if args.debug:
         logging.basicConfig(level=logging.DEBUG)
     else:
         logging.basicConfig(level=logging.WARNING)
 
+    kafka_args = (args.kafka_topic,
+                  args.kafka_bootstrap_servers,
+                  args.kafka_consumer_group_id)
+
+    # Check if a Kafka config is provided
+    if any(kafka_args):
+        if not all(kafka_args):
+            parser.error('Kafka configuration incomplete, '
+                         'please provide a topic, one or more brokers '
+                         'as bootstrap-servers and the consumer group id.')
+        else:
+            kafka_config = {}
+            kafka_config['topic'] = args.kafka_topic
+            kafka_config['bootstrap_servers'] = args.kafka_bootstrap_servers
+            kafka_config['group_id'] = args.kafka_consumer_group_id
+            log.info('Using Kafka config: {}'.format(kafka_config))
+    else:
+        kafka_config = None
+
     collect_metrics_from = []
 
     address, port = args.listen.split(':', 1)
@@ -143,7 +172,8 @@ def main():
     log.info('Checking consistency of metrics config file..')
     check_metrics_config_file_consistency(metrics_config)
 
-    druid_collector = collector.DruidCollector(metrics_config)
+    druid_collector = collector.DruidCollector(
+        metrics_config, kafka_config)
     REGISTRY.register(druid_collector)
     prometheus_app = make_wsgi_app()
     druid_wsgi_app = DruidWSGIApp(args.uri, druid_collector,
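To make the option handling above concrete: --kafka-bootstrap-servers uses nargs='+', so multiple brokers can be passed space-separated and end up as a list. With the placeholder invocation sketched near the top of this page, the dict handed to DruidCollector would be:

    kafka_config = {
        'topic': 'druid-metrics',
        'bootstrap_servers': ['kafka1001:9092', 'kafka1002:9092'],
        'group_id': 'druid_exporter',
    }

Supplying only a subset of the three options makes the parser exit with the 'Kafka configuration incomplete' error instead.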

setup.py (+4 -1)

@@ -1,7 +1,7 @@
 from setuptools import setup
 
 setup(name='druid_exporter',
-      version='0.9',
+      version='0.10',
       description='Prometheus exporter for Druid',
       url='https://github.com/wikimedia/operations-software-druid_exporter',
       author='Luca Toscano',
@@ -12,6 +12,9 @@
           'prometheus-client>=0.5.0',
           'gevent',
       ],
+      extras_require={
+          'kafka': ['kafka-python']
+      },
       entry_points={
           'console_scripts': [
               'druid_exporter = druid_exporter.exporter:main'
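The new extras_require entry keeps the Kafka dependency optional: as shown in collector.py above, the exporter falls back to HTTP-only ingestion when kafka-python cannot be imported. To pull the extra in when installing from a checkout of this repository (standard setuptools extras syntax):

    pip install ".[kafka]"

A plain pip install . keeps the previous dependency set and behaviour.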
