diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 42c991a..1dff4b1 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -17,10 +17,10 @@ jobs:
         key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}
         restore-keys: |
           ${{ runner.os }}-maven-
-    - name: Set up JDK 12
+    - name: Set up JDK 21
       uses: actions/setup-java@main
       with:
-        java-version: 12
+        java-version: 21
         distribution: adopt
     - name: Build and test project
       env:
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 7feb068..d931264 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -17,10 +17,10 @@ jobs:
         key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}
         restore-keys: |
           ${{ runner.os }}-maven-
-    - name: Set up JDK 12
+    - name: Set up JDK 21
       uses: actions/setup-java@main
       with:
-        java-version: 12
+        java-version: 21
         distribution: adopt
     - name: Build and test project
       env:
diff --git a/LICENSE_COMMERCIAL b/LICENSE_COMMERCIAL
index 6d53567..9347e9d 100644
--- a/LICENSE_COMMERCIAL
+++ b/LICENSE_COMMERCIAL
@@ -3,18 +3,18 @@
 Preamble
 
-  The Commercial License is an addendum to the GPL license - see
+  The Commercial License is an addendum to the GPL license - see
 LICENSE in the same folder - superseding certain license related
 aspects. It applies to productive environments only, not Test, QA or
-development systems. For those, the GPL license applies providing
+development systems. For those, the GPL license applies, providing
 a free usage license.
 
-  The main purpose of the commercial license is to bill for the
+  The main purpose of the commercial license is to bill for the
 usage of the Program on a monthly basis, thus allowing further
 development.
 
 The key metric used is the number of messages processed by the
-Program per month.
+program (sum of all instances) per month.
 
 PAY-PER-USE
 
@@ -29,14 +29,19 @@ Programs are billed according to the following table
 
 Example 1: In a test environment the Program is evaluated for its
 performance over a long period of time.
---> The Program is used under the GPL license as it is executed in a
+--> The program is used under the GPL license as it is executed in a
 test environment, not a production system, hence no fees occur.
 
-Example 2: The Program is used to copy productive data, the amount
+Example 2: The program is used to copy productive data, the amount
 of rows copied is 50'000rows per month.
 --> The program is used under this Commercial License but the fee is
 0EUR according to above table. No contract needs to be signed,
-no invoices will be created.
+no invoices will be created.
+
+Example 3: The program is started 20 times to process different Kafka
+topics and partitions at speed, processing 5m records per month in
+total. --> The program is used under this Commercial License and
+100EUR/month are to be invoiced.
 
 Once an order is placed for a defined volume, rtdi.io GmbH (Austria,
 Europe), as the copyright owner, will send invoices on a monthly basis
@@ -53,7 +58,7 @@ GDPR - General Data Protection and Privacy - information)
 
 DIFFERENCES TO SOFTWARE LICENSES
 
 Unlike classic software licenses, where license and support contract
-go in tandem, the monthly usage is charged and the Program provided
+go in tandem, the monthly usage is charged and the program provided
 as-is. As consequence no support and no warranties are provided. In
 this regards all clauses of the GPL apply, specifically but not
 exclusive Section 15 and Section 16.
diff --git a/README.md b/README.md
index 7c13166..2460008 100644
--- a/README.md
+++ b/README.md
@@ -3,6 +3,7 @@
 _Validate incoming messages and augment them with rule testing results_
 
 Source code available here: [github](https://github.com/rtdi/RTDIRulesService)
+Docker image here: [dockerhub](https://hub.docker.com/r/rtdi/rulesservice)
 
 ## Design Thinking goal
@@ -12,26 +13,32 @@ Docker image here: [dockerhub](https://hub.docker.com/r/rtdi/rulesservice)
 * Operational dashboards using the rule results provide information about the data quality
 * Different types of rules should be supported: validation rules, cleansing rules, data augmentation, standardization rules, ...
 
+## Requirements
+
+* Payload (value) in Avro format
+* Apache Kafka connection with the permissions to run as a KStream
+* Schema Registry connection to read (and write) schema definitions
+
 ## Installation and testing
 
-On any computer install the Docker Daemon - if it is not already - and download this docker image with
+On any computer install the Docker Daemon - if not already done - and download the docker image with
 
     docker pull rtdi/rulesservice
 
 Then start the image via docker run. For a quick test this command is sufficient...
 
     docker run -d -p 80:8080 --rm --name rulesservice rtdi/rulesservice
 
 to expose a webserver at port 80 on the host running the container. Make sure to open the web page via the http prefix, as https needs more configuration.
-For example [http://localhost:80/](http://localhost:80/) might do the trick of the container is hosted on the same computer.
+For example [http://localhost:80/](http://localhost:80/) might do the trick if the container is hosted on the same computer.
 
 The default login for this startup method is: **rtdi / rtdi!io**
 
 The better start command is to mount two host directories into the container: the rtdiconfig directory, where all settings made when configuring the connector are stored permanently, and the security directory for web-server-specific settings like the user database and SSL certificates.
 
-    docker run -d -p 80:8080 -p 443:8443 --rm -v /data/files:/data/ \
-      -v /home/dir/rtdiconfig:/usr/local/tomcat/conf/rtdiconfig \
+    docker run -d -p 80:8080 -p 443:8443 --rm \
+      -v /home/dir/rulesservice:/apps/rulesservice \
       -v /home/dir/security:/usr/local/tomcat/conf/security \
       --name rulesservice rtdi/rulesservice
 
@@ -39,73 +46,149 @@ The probably better start command is to mount two host directories into the cont
 For proper start commands, especially the https and security related ones, see the [ConnectorRootApp](https://github.com/rtdi/ConnectorRootApp) project, which this application is based on.
 
-
-### Connect the Pipeline to Kafka
+### Step 1: Connect to Kafka
+
+The first step is to connect the application to a Kafka server and the schema registry. In the settings screen the normal Kafka properties file data can be pasted and saved. By default the file location is `/apps/rulesservice/settings/kafka.properties` from the container's point of view.
+
+
+
+
+```
+basic.auth.credentials.source=USER_INFO
+schema.registry.basic.auth.user.info=XXXXXXXXXXXXXXXX:XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
+schema.registry.url=https://XXXXXXXXXX.eu-central-1.aws.confluent.cloud
+
+bootstrap.servers=XXXXXXXXX.eu-central-1.aws.confluent.cloud:9092
+sasl.jaas.config=org.apache.kafka.common.security.plain.PlainLoginModule \
+required username="XXXXXXXXXXXXXXXX" password="XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX";
+security.protocol=SASL_SSL
+sasl.mechanism=PLAIN
+```
+
+### Step 2: Define Rules
+
+A rule file applies logic to messages of a given schema/subject. It consumes the input message, applies the rules and creates an output message with a derived schema - a schema that contains the rule results as an additional field. Which input and output topics are used is configured in the next step.
+
+Rule files are saved by default under `/apps/rulesservice/definitions/<subjectname>/inactive/...`, and the file name itself can contain sub-directories.
+
+To simplify entering rules, sample values can be entered and the result recalculated. A quick way to provide sample values is by reading topics and creating sample files with their content - see below. These files, with messages of the same subject name, can be selected and hence used as input.
+
+Once a rule file is complete, it must be copied from the `inactive` to the `active` directory. The button `Activate` does that. The reason for this two-staged approach is to let users save intermediate definitions without impacting the currently running service.
+
+
+### Step 3: Topics
+
+An input topic can contain messages of different subjects, hence the dialog asks which input topic should serve which rule files (remember, a rule file works on a specific subject) and what the output topic should be.
+Scaling is achieved by increasing the number of KStream instances used for this topic or by spinning up more containers with the same settings.
+
+The screen also allows copying the rule files being used into the active folder, to simplify activating each from the rule file dialog.
+
-The first step is to connect the application to a Kafka server, in this example Confluent Cloud.
-
+### Result
+
+If the input schema has an `_audit` field, it is assumed the schema contains the structure for the rule results already. This is the preferred case, because then input schema = output schema.
+In all other cases the input schema's latest version is read from the schema registry and the additional `_audit` structure is created. This will be the output schema.
-### Define Services
+The reason for using the latest version is schema evolution. It might happen that schema id 2 has an additional field `NAME` compared to schema id 1, i.e. the subject evolved from schema 1 to 2. Suppose the KStream receives a message with schema 2 first, adds the `_audit` field and saves the derived schema in the schema registry. The next message comes in with input schema 1; if that were registered as the output schema next, registration would fail due to the missing `NAME` field. Hence all messages must always be output with the latest schema version. This also explains why adding the `_audit` structure to the original input schema in the first place is preferred.
+The audit structure stores the overall rule result (did the record pass all tests?) plus a list of all rules executed and their individual results.
+Querying this data allows detailed reporting on which records were processed by which rule and with what results.
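+
+To give a feel for the shape of that structure - the authoritative definition is the audit schema linked below - a minimal sketch in Avro schema notation could look as follows. All field names here are illustrative assumptions, not the actual definition:
+
+```json
+{
+  "name": "_audit",
+  "doc": "Illustrative sketch only - see docs/audit-schema.md for the actual definition",
+  "type": {
+    "type": "record",
+    "name": "AuditDetails",
+    "fields": [
+      { "name": "transformresult", "type": "string",
+        "doc": "assumed: the overall result, e.g. PASS, WARN or FAIL" },
+      { "name": "transformationlog",
+        "doc": "assumed: one entry per executed rule",
+        "type": { "type": "array", "items": {
+          "type": "record",
+          "name": "AuditEntry",
+          "fields": [
+            { "name": "rulename", "type": "string" },
+            { "name": "result", "type": "string" }
+          ] } } }
+    ]
+  }
+}
+```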
-Each Service is a Kafka KStream, a distributed process listening on a topic and validating the data. Hence the first setting of each service are the input and output topic names to use.
+The exact Avro schema field definition can be found [here](docs/audit-schema.md).
-Within one rule service for each schema multiple steps can be performed, a microservice transformation so to speak. These transformation steps happen within the service. For example the first microservice step might check missing data, the next standardize on the different spellings. The result of this step is then put into a third step, validating if the STATUS is valid and consistent with other data.
+
-
-Once the structure is saved, by clicking on the individual steps the rules themselves are defined.
+### Sample files
-Because a topic can contain data for multiple schemas, the rules are defined for each schema individually. If the message schema has no rules defined, it is passed through unchanged.
+
+To create sample files, one or multiple topics are selected in the screen and executed. All topics and all their partitions are read to find the most recent 100 messages.
+The found messages are streamed in chunks into the screen and can be saved, either individually or by selecting some/all in the table and clicking the `Save selected` button.
-
+The files are stored in the directory `/apps/rulesservice/definitions/<subjectname>/sampledata/`.
+If no file name is specified, the name will be `partition_<partition>_offset_<offset>.json`.
-After a schema has been selected, its structure with all nested elements is shown and rules can be defined for each field.
+
-An example would be a rule on the OrderStatus column, which must contain the values C (..Completed) or N (..New). One way would be a a single rule on the column with a OR condition. The other option would be five rules testing for a single status value only and the test type is "Test until first Passes".
+## Rules
-
+### Rule Formula
-### Data content
+
+The Single Test rule type applied to a field is used to specify conditions and optionally to modify a value.
+The Condition is a formula like `firstname != null` and specifies the expected state - here it is expected that the firstname field has a value. This formula must return true or false.
-The result of all rule transformations is stored in the audit array of the record. Querying this data allows detailed reporting which records were processed by what rule and the results.
-If the input schema does not have such __audit array, a new version of the schema will be created automatically.
-
+
+If the formula returns false, the `if test failed...` setting tells the severity. In most cases a false means the test `=Fail`, but it can be set to `=Warn` or even `=Pass`.
-### Rule types
+Obviously, formulas can get more complicated, e.g. `lastname != null && lastname.length() > 10`.
-Rules can be applied on fields only and this field acts as the trigger and is used when providing a substitution value.
-For example, the SalesOrderHeader has a field SoldTo at root level, hence all rules will be executed once per message.
+Formulas can access other fields as well, e.g. for the field `lastname` the condition `lastname != null || firstname != null` means that either lastname or firstname must be set.
-A rule on a field of the SalesItems array will be triggered for each item. Thus the field a rule belongs to controls primarily that.
+To access fields at a deeper level - e.g. when lastname is set, the first record in the addresses array must have addresstype HOME - many cases must be considered.
+What if the addresses array is null? What if it is an empty array? What if the addresstype field is null for some records? This would lead to a very long condition, but there are shortcuts: `lastname != null && 'HOME' == addresses?[0]?addresstype`. The `?` says it is okay for the field to be null, and the expression returns null in that case.
-For each field either a single rule "Test a column" can be defined or a Test-Set with multiple rules.
+Formulas at a deeper level, e.g. a formula for `addresstype`, can access fields from the higher level via `parent`. Example: `('HOME' == addresstype && parent.parent.lastname != null) || 'HOME' != addresstype` says that for addresstype HOME the lastname must be set as well; for all other addresstype values there is no such requirement.
-- Test for all conditions: Every single rule is applied, thus acting as a AND combination of rules. This is convenient for cases where a field needs to be checked for different conditions, e.g. the OrderDate has to be within three months or raise a warning and the OrderDate cannot be later than the expected delivery date. The rule set will return the lowest individual rule result. If all passed the result is pass. If at least one said warning, the result is a warning. And if one is failed, the result is failed.
-- Test until one failed: Each rule will be tested and stopped once the first rule violation is found. It is a AND combination of the rules as well but while in above case all rules are tested, here all other rules are not added to the test result. The rule set will return failed when at least one is failed and pass only if all rules passed.
-- Test until one passes: This is an OR condition. If condition1 is not met, maybe condition2 is. As soon as the first positive test is found, no further processing is needed, a valid value has been found. The rule set will return failed only if all tests failed. One positive test makes the entire rule set positive.
+Note: In this example the customer record has an addresses array of address records. When within an address record, the parent is the array and its parent is the customer record with the lastname field.
-For each individual rule the rule result can be specified if the condition is met. This way the rule can specify the severity, e.g. a SoldTo == null shall be treated as failed, SoldTo.length() < 5 as well but SoldTo.length() < 10 shall be a warning only.
+These are just very basic examples; more follow below.
-Other tests might not impact the rule result at all, they return passed always. For those the audit array will show that the rule has been tested but the data is of no harm. For example in a gender column the test could be if the value is either M or F and in all other cases a substitution value of X is used. As the gender does return the values M,F or X only, it is to be considered valid.
+
+If a condition returns false, maybe the correct value can be derived, in which case the rule as such has not been violated. The optional formula entered in `..change value to...` is executed only if the condition returns false, and it overwrites the field value. Example: The rule for the field `addresstype` is `addresstype.upper() != addresstype` and the change-value formula is `addresstype.upper()`. This changes all values to upper case.
+In such a case the rule is considered to have passed instead of failed, and that is accomplished via the `if test failed...` setting `=Pass`.
+Each test and its rule result is available in the audit structure, and hence we can see that this test was executed and passed.
+
+
+### Rule Sets
+
+In the above examples there was the test `('HOME' == addresstype && parent.parent.lastname != null) || 'HOME' != addresstype`. A more realistic formula would say: if HOME then the lastname must be set, if COMPANY the companyname field must be set, and the other allowed values are SHIPPING and BILLING. This would make for quite a long formula.
+
+To simplify that, the Test-Set rule allows specifying multiple individual rules:
+
+ - Test for all conditions (`Test all`): Every single rule is applied, thus acting as an AND combination of rules. This is convenient for cases where a field needs to be checked for different conditions, e.g. the OrderDate has to be within three months or raise a warning, and the OrderDate cannot be later than the expected delivery date. The rule set will return the lowest individual rule result: if all passed the result is pass, if at least one returned a warning the result is a warning, and if one failed the result is failed.
+ - Test until one failed (`Until failed`): Each rule is tested until the first rule violation is found. It is an AND combination of the rules as well, but while in the above case all rules are tested, here the remaining rules are not added to the test result. The rule set will return failed when at least one rule failed, and pass only if all rules passed.
+ - Test until one passes (`Until passes`): This is an OR condition. If condition1 is not met, maybe condition2 is. As soon as the first positive test is found, no further processing is needed - a valid value has been found. The rule set will return failed only if all tests failed. One positive test makes the entire rule set positive and the remaining tests are not executed.
+
+For the addresstype example, the `Until-passes` test set is best suited, with the individual rules
+
+ - `'HOME' == addresstype && parent.parent.lastname != null`
+ - `'COMPANY' == addresstype && parent.parent.companyname != null`
+ - `'SHIPPING' == addresstype`
+ - `'BILLING' == addresstype`
+
+If the addresstype is `SHIPPING`, the first test returns false, hence the second is executed, also returning false, and the third condition returns true --> no more tests are made and the test set is Pass.
+
+If the addresstype is `ABCD`, none of the conditions will return true --> the test set is Fail.
+
+This simplifies creating difficult rules, especially in combination with the `if test failed...` setting, e.g. `SoldTo == null` shall be treated as failed, `SoldTo.length() < 5` as well, but `SoldTo.length() < 10` shall be a warning only.
+
+
+### Generic rules
+
+Each record also has a `(more)` node to enter rules that do not belong to a single field. Such generic rules cannot have a change-value formula as they are not bound to a field.
+
+
+### Rule Steps
+
+Another typical scenario is to standardize the values first - e.g. gender should be `M`, `F`, `X`, `?` only - and then create rules based on the standardized values. In other words, rules build on each other. To enable that, the rule file consists of multiple tabs - the rule steps - and each tab is executed one after the other, as the sketch below illustrates.
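+
+To make the two-step idea concrete, here is a minimal sketch of the semantics. It evaluates the formulas with Apache JEXL - the library the original README named for the rule syntax - and the field values, formulas and step layout are illustrative assumptions, not the service's actual implementation:
+
+```java
+import org.apache.commons.jexl3.*;
+
+public class RuleStepsSketch {
+    public static void main(String[] args) {
+        JexlEngine jexl = new JexlBuilder().create();
+        JexlContext ctx = new MapContext();
+        ctx.set("gender", "female"); // assumed raw input value
+
+        // Step 1 (standardize): the condition fails for "female", so the
+        // change-value formula rewrites the field to a standardized code.
+        JexlExpression condition1 = jexl.createExpression(
+            "gender == 'M' || gender == 'F' || gender == 'X' || gender == '?'");
+        if (!((Boolean) condition1.evaluate(ctx))) {
+            JexlExpression changeValue = jexl.createExpression(
+                "gender.substring(0, 1).toUpperCase()");
+            ctx.set("gender", changeValue.evaluate(ctx)); // "female" -> "F"
+        }
+
+        // Step 2 (validate): runs on the standardized value produced by step 1.
+        JexlExpression condition2 = jexl.createExpression(
+            "gender == 'M' || gender == 'F'");
+        System.out.println(condition2.evaluate(ctx)); // prints: true
+    }
+}
+```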
-An extreme case is to assign a column a fixed value: the condition is the formula "true", the rule result will be pass and the substitution formula is the constant to assign.
 
 ### Rule syntax
 
-When entering formulas, the columns of the current level can be used directly. For example on the SalesItems level a formula might be "MaterialNumber != null" if the SalesItems structure consists of records that have a MaterialNumber column. The keyword "parent" refers to the parent element, in this example the array of SalesItems.
-Thus a formula might be "LineNumber <= parent.length()" if the assumption is, 3 line items are numbered as items 1,2 and 3.
+For more examples, see the [rule syntax documentation](docs/rule-syntax.md).
+
+
+### FAQs
+
+ * Can a new output column be created via a formula? No, the output schema is always derived from the input schema, for two reasons. First, if adding fields were possible, they might collide when the input subject evolves to a new version. The other reason is performance: it would require creating a new output message from scratch, copying the majority of the data even if nothing has changed. That would be too expensive. So the only option is to add the column to the input schema first.
-Note: The library used here is [Apache JEXL](https://commons.apache.org/proper/commons-jexl/reference/syntax.html).
 
 ## Licensing
 
-This application is provided as dual license. For all users with less than 100'000 messages created per month, the application can be used free of charge and the code falls under a Gnu Public License. Users with more than 100'000 messages are asked to get a commercial license to support further development of this solution. The commercial license is on a monthly pay-per-use basis.
+This application is provided under a dual license. For all users with fewer than 100'000 messages processed per month, the application can be used free of charge and the code falls under the GNU Public License. Users with more than 100'000 messages per month are asked to get a [commercial license](LICENSE_COMMERCIAL) to support further development of this solution. The commercial license is on a monthly pay-per-use basis.
 
 ## Data protection and privacy
 
 Every ten minutes the application sends the message statistics via an HTTP call to a central server, where the data is stored for information along with the public IP address (usually the IP address of the router). It is just a count of which service was invoked and how often - no information about endpoints, users, data or URL parameters. This information is collected to get an idea about adoption.
-To disable that, set the environment variable HANAAPPCONTAINERSTATISTICS=FALSE.
\ No newline at end of file
+To disable that, set the environment variable STATISTICS=FALSE.
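+
+As an illustration - this assumes the standard `docker run -e` mechanism and the quick-test command from above, nothing rulesservice-specific:
+
+    docker run -d -p 80:8080 --rm -e STATISTICS=FALSE --name rulesservice rtdi/rulesservice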
\ No newline at end of file diff --git a/WebContent/Controller.controller.js b/WebContent/Controller.controller.js new file mode 100644 index 0000000..99dd016 --- /dev/null +++ b/WebContent/Controller.controller.js @@ -0,0 +1,14 @@ +sap.ui.define([ "sap/ui/core/mvc/Controller"], function(Controller) { + "use strict"; + + return Controller.extend("io.rtdi.bigdata.rulesservice.Controller", { + + onInit : function() { + var oModel = new sap.ui.model.json.JSONModel(); + oModel.loadData("./rest/config/service"); + this.getView().setModel(oModel); + } + + }); + +}); diff --git a/WebContent/View.view.xml b/WebContent/View.view.xml new file mode 100644 index 0000000..e04fce2 --- /dev/null +++ b/WebContent/View.view.xml @@ -0,0 +1,116 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/WebContent/WEB-INF/web.xml b/WebContent/WEB-INF/web.xml index 79e59ba..dbfbfa3 100644 --- a/WebContent/WEB-INF/web.xml +++ b/WebContent/WEB-INF/web.xml @@ -4,11 +4,6 @@ RulesService index.html - index.htm - index.jsp - default.html - default.htm - default.jsp jersey-servlet @@ -18,6 +13,7 @@ io.rtdi.bigdata.rulesservice.JerseyApplicationSettings 1 + true jersey-servlet @@ -29,21 +25,22 @@ /ui5/* - connectorview + rulesview + BASIC + + - connectorconfig - connectorview - connectorschema - connectoroperator + rulesview + rulesadmin diff --git a/WebContent/index.html b/WebContent/index.html new file mode 100644 index 0000000..36d9d0e --- /dev/null +++ b/WebContent/index.html @@ -0,0 +1,37 @@ + + + + + + Application Directory + + + + + + + +

Loading OpenUI5 from here

+ + diff --git a/WebContent/ui5/Config.controller.js b/WebContent/ui5/Config.controller.js new file mode 100644 index 0000000..58253ce --- /dev/null +++ b/WebContent/ui5/Config.controller.js @@ -0,0 +1,31 @@ +sap.ui.define(["sap/ui/core/mvc/Controller"], +function(Controller) {"use strict"; +return Controller.extend("io.rtdi.bigdata.rulesservice.ui5.Config", { + onInit : function() { + var model = new sap.ui.model.json.JSONModel(); + model.attachRequestFailed(function(event) { + var text = event.getParameter("responseText"); + sap.m.MessageToast.show("Reading config failed: " + text); + }); + model.loadData("../rest/config"); + this.getView().setModel(model); + }, + onSave : function(event) { + var model = this.getView().getModel(); + var post = new sap.ui.model.json.JSONModel(); + post.attachRequestFailed(function(event) { + var text = event.getParameter("responseText"); + sap.m.MessageToast.show("Save failed: " + text); + }); + post.attachRequestCompleted(function() { + console.log(post.getProperty("/")); + }); + var json = JSON.stringify(model.getProperty("/")); + var headers = { + "Content-Type": "application/json;charset=utf-8" + } + post.loadData("../rest/config", json, true, "POST", false, true, headers); + }, +}); +}); + diff --git a/WebContent/ui5/Config.html b/WebContent/ui5/Config.html new file mode 100644 index 0000000..a0002eb --- /dev/null +++ b/WebContent/ui5/Config.html @@ -0,0 +1,35 @@ + + + + + + Connector + + + + + + +

Loading OpenUI5 from here

+ + + diff --git a/WebContent/ui5/Config.view.xml b/WebContent/ui5/Config.view.xml new file mode 100644 index 0000000..cccb9ae --- /dev/null +++ b/WebContent/ui5/Config.view.xml @@ -0,0 +1,38 @@ + + + + + + +