Skip to content

Commit 3e15bf9

Browse files
committed
improve csv emitter and jdbc emitter (breaking changes in tika-snapshot)
1 parent e00a465 commit 3e15bf9

File tree

7 files changed

+52
-43
lines changed

7 files changed

+52
-43
lines changed

tika-gui-app/pom.xml

+29-29
Original file line numberDiff line numberDiff line change
@@ -194,52 +194,52 @@
194194
<artifactId>download-maven-plugin</artifactId>
195195
<executions>
196196
<execution>
197-
<id>tika-async-cli-3.0.0-20240404.081031-473</id>
197+
<id>tika-async-cli-3.0.0-20240404.161948-475</id>
198198
<phase>prepare-package</phase>
199199
<goals>
200200
<goal>wget</goal>
201201
</goals>
202202
<configuration>
203-
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-async-cli/3.0.0-SNAPSHOT/tika-async-cli-3.0.0-20240404.081031-473.jar</url>
203+
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-async-cli/3.0.0-SNAPSHOT/tika-async-cli-3.0.0-20240404.161948-475.jar</url>
204204
<unpack>false</unpack>
205205
<outputDirectory>${project.build.directory}/lib/tika-core</outputDirectory>
206206
<md5>e46bf085b01462c429353e21b44aba1d</md5>
207207
</configuration>
208208
</execution>
209209
<execution>
210-
<id>tika-fetcher-s3-3.0.0-20240404.081053-477</id>
210+
<id>tika-fetcher-s3-3.0.0-20240404.162001-479</id>
211211
<phase>prepare-package</phase>
212212
<goals>
213213
<goal>wget</goal>
214214
</goals>
215215
<configuration>
216-
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-fetcher-s3/3.0.0-SNAPSHOT/tika-fetcher-s3-3.0.0-20240404.081053-477.jar</url>
216+
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-fetcher-s3/3.0.0-SNAPSHOT/tika-fetcher-s3-3.0.0-20240404.162001-479.jar</url>
217217
<unpack>false</unpack>
218218
<outputDirectory>${project.build.directory}/lib/tika-fetcher-s3</outputDirectory>
219219
<md5>bbe5836d41044a0369d3d29384131836</md5>
220220
</configuration>
221221
</execution>
222222
<execution>
223-
<id>tika-parser-sqlite3-package-3.0.0-20240404.081120-490</id>
223+
<id>tika-parser-sqlite3-package-3.0.0-20240404.162011-492</id>
224224
<phase>prepare-package</phase>
225225
<goals>
226226
<goal>wget</goal>
227227
</goals>
228228
<configuration>
229-
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-parser-sqlite3-package/3.0.0-SNAPSHOT/tika-parser-sqlite3-package-3.0.0-20240404.081120-490.jar</url>
229+
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-parser-sqlite3-package/3.0.0-SNAPSHOT/tika-parser-sqlite3-package-3.0.0-20240404.162011-492.jar</url>
230230
<unpack>false</unpack>
231231
<outputDirectory>${project.build.directory}/lib/tika-app</outputDirectory>
232232
<md5>501126158285629af0dc3fbbe6843185</md5>
233233
</configuration>
234234
</execution>
235235
<execution>
236-
<id>tika-pipes-reporter-fs-status-3.0.0-20240404.081138-473</id>
236+
<id>tika-pipes-reporter-fs-status-3.0.0-20240404.162018-475</id>
237237
<phase>prepare-package</phase>
238238
<goals>
239239
<goal>wget</goal>
240240
</goals>
241241
<configuration>
242-
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-pipes-reporter-fs-status/3.0.0-SNAPSHOT/tika-pipes-reporter-fs-status-3.0.0-20240404.081138-473.jar</url>
242+
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-pipes-reporter-fs-status/3.0.0-SNAPSHOT/tika-pipes-reporter-fs-status-3.0.0-20240404.162018-475.jar</url>
243243
<unpack>false</unpack>
244244
<outputDirectory>${project.build.directory}/lib/tika-core</outputDirectory>
245245
<md5>6c7ec2128406dab85a859ca0aa5d4781</md5>
@@ -259,52 +259,52 @@
259259
</configuration>
260260
</execution>
261261
<execution>
262-
<id>tika-emitter-fs-3.0.0-20240404.081039-477</id>
262+
<id>tika-emitter-fs-3.0.0-20240404.161951-479</id>
263263
<phase>prepare-package</phase>
264264
<goals>
265265
<goal>wget</goal>
266266
</goals>
267267
<configuration>
268-
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-emitter-fs/3.0.0-SNAPSHOT/tika-emitter-fs-3.0.0-20240404.081039-477.jar</url>
268+
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-emitter-fs/3.0.0-SNAPSHOT/tika-emitter-fs-3.0.0-20240404.161951-479.jar</url>
269269
<unpack>false</unpack>
270270
<outputDirectory>${project.build.directory}/lib/tika-emitter-fs</outputDirectory>
271271
<md5>e8be348519559c1a925f86d2120852cb</md5>
272272
</configuration>
273273
</execution>
274274
<execution>
275-
<id>tika-emitter-jdbc-3.0.0-20240404.081041-473</id>
275+
<id>tika-emitter-jdbc-3.0.0-20240404.161952-475</id>
276276
<phase>prepare-package</phase>
277277
<goals>
278278
<goal>wget</goal>
279279
</goals>
280280
<configuration>
281-
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-emitter-jdbc/3.0.0-SNAPSHOT/tika-emitter-jdbc-3.0.0-20240404.081041-473.jar</url>
281+
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-emitter-jdbc/3.0.0-SNAPSHOT/tika-emitter-jdbc-3.0.0-20240404.161952-475.jar</url>
282282
<unpack>false</unpack>
283283
<outputDirectory>${project.build.directory}/lib/tika-emitter-jdbc</outputDirectory>
284284
<md5>90078d960cacf9bf16bdbf0cdd57e179</md5>
285285
</configuration>
286286
</execution>
287287
<execution>
288-
<id>tika-pipes-reporter-jdbc-3.0.0-20240404.081138-473</id>
288+
<id>tika-pipes-reporter-jdbc-3.0.0-20240404.162018-475</id>
289289
<phase>prepare-package</phase>
290290
<goals>
291291
<goal>wget</goal>
292292
</goals>
293293
<configuration>
294-
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-pipes-reporter-jdbc/3.0.0-SNAPSHOT/tika-pipes-reporter-jdbc-3.0.0-20240404.081138-473.jar</url>
294+
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-pipes-reporter-jdbc/3.0.0-SNAPSHOT/tika-pipes-reporter-jdbc-3.0.0-20240404.162018-475.jar</url>
295295
<unpack>false</unpack>
296296
<outputDirectory>${project.build.directory}/lib/tika-core</outputDirectory>
297-
<md5>643daa6dfe8a1695735d271815e186ed</md5>
297+
<md5>48efffb7ca433f1c866d9838e117f4db</md5>
298298
</configuration>
299299
</execution>
300300
<execution>
301-
<id>tika-eval-core-3.0.0-20240404.081048-472</id>
301+
<id>tika-eval-core-3.0.0-20240404.161957-474</id>
302302
<phase>prepare-package</phase>
303303
<goals>
304304
<goal>wget</goal>
305305
</goals>
306306
<configuration>
307-
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-eval-core/3.0.0-SNAPSHOT/tika-eval-core-3.0.0-20240404.081048-472.jar</url>
307+
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-eval-core/3.0.0-SNAPSHOT/tika-eval-core-3.0.0-20240404.161957-474.jar</url>
308308
<unpack>false</unpack>
309309
<outputDirectory>${project.build.directory}/lib/tika-extras</outputDirectory>
310310
<md5>b458334817f28da268e07974d99b03b7</md5>
@@ -324,13 +324,13 @@
324324
</configuration>
325325
</execution>
326326
<execution>
327-
<id>tika-pipes-iterator-s3-3.0.0-20240404.081135-473</id>
327+
<id>tika-pipes-iterator-s3-3.0.0-20240404.162017-475</id>
328328
<phase>prepare-package</phase>
329329
<goals>
330330
<goal>wget</goal>
331331
</goals>
332332
<configuration>
333-
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-pipes-iterator-s3/3.0.0-SNAPSHOT/tika-pipes-iterator-s3-3.0.0-20240404.081135-473.jar</url>
333+
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-pipes-iterator-s3/3.0.0-SNAPSHOT/tika-pipes-iterator-s3-3.0.0-20240404.162017-475.jar</url>
334334
<unpack>false</unpack>
335335
<outputDirectory>${project.build.directory}/lib/tika-pipes-iterator-s3</outputDirectory>
336336
<md5>01b46c92d0d8f9f352172d49e88b893f</md5>
@@ -350,65 +350,65 @@
350350
</configuration>
351351
</execution>
352352
<execution>
353-
<id>tika-serialization-3.0.0-20240404.081141-493</id>
353+
<id>tika-serialization-3.0.0-20240404.162019-495</id>
354354
<phase>prepare-package</phase>
355355
<goals>
356356
<goal>wget</goal>
357357
</goals>
358358
<configuration>
359-
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-serialization/3.0.0-SNAPSHOT/tika-serialization-3.0.0-20240404.081141-493.jar</url>
359+
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-serialization/3.0.0-SNAPSHOT/tika-serialization-3.0.0-20240404.162019-495.jar</url>
360360
<unpack>false</unpack>
361361
<outputDirectory>${project.build.directory}/lib/tika-core</outputDirectory>
362362
<md5>8cfb9502171c9462341f0e3e93a0e19e</md5>
363363
</configuration>
364364
</execution>
365365
<execution>
366-
<id>tika-detector-siegfried-3.0.0-20240404.081036-471</id>
366+
<id>tika-detector-siegfried-3.0.0-20240404.161950-473</id>
367367
<phase>prepare-package</phase>
368368
<goals>
369369
<goal>wget</goal>
370370
</goals>
371371
<configuration>
372-
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-detector-siegfried/3.0.0-SNAPSHOT/tika-detector-siegfried-3.0.0-20240404.081036-471.jar</url>
372+
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-detector-siegfried/3.0.0-SNAPSHOT/tika-detector-siegfried-3.0.0-20240404.161950-473.jar</url>
373373
<unpack>false</unpack>
374374
<outputDirectory>${project.build.directory}/lib/tika-extras</outputDirectory>
375375
<md5>f1d481f50f8e33997c559cc2eed82120</md5>
376376
</configuration>
377377
</execution>
378378
<execution>
379-
<id>tika-emitter-opensearch-3.0.0-20240404.081042-473</id>
379+
<id>tika-emitter-opensearch-3.0.0-20240404.161953-475</id>
380380
<phase>prepare-package</phase>
381381
<goals>
382382
<goal>wget</goal>
383383
</goals>
384384
<configuration>
385-
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-emitter-opensearch/3.0.0-SNAPSHOT/tika-emitter-opensearch-3.0.0-20240404.081042-473.jar</url>
385+
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-emitter-opensearch/3.0.0-SNAPSHOT/tika-emitter-opensearch-3.0.0-20240404.161953-475.jar</url>
386386
<unpack>false</unpack>
387387
<outputDirectory>${project.build.directory}/lib/tika-emitter-opensearch</outputDirectory>
388388
<md5>c9ee40170c87a2234786d721255f5f96</md5>
389389
</configuration>
390390
</execution>
391391
<execution>
392-
<id>tika-emitter-s3-3.0.0-20240404.081043-477</id>
392+
<id>tika-emitter-s3-3.0.0-20240404.161953-479</id>
393393
<phase>prepare-package</phase>
394394
<goals>
395395
<goal>wget</goal>
396396
</goals>
397397
<configuration>
398-
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-emitter-s3/3.0.0-SNAPSHOT/tika-emitter-s3-3.0.0-20240404.081043-477.jar</url>
398+
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-emitter-s3/3.0.0-SNAPSHOT/tika-emitter-s3-3.0.0-20240404.161953-479.jar</url>
399399
<unpack>false</unpack>
400400
<outputDirectory>${project.build.directory}/lib/tika-emitter-s3</outputDirectory>
401401
<md5>577e03105df2137ad64b726606095d7e</md5>
402402
</configuration>
403403
</execution>
404404
<execution>
405-
<id>tika-app-3.0.0-20240404.081029-473</id>
405+
<id>tika-app-3.0.0-20240404.161946-475</id>
406406
<phase>prepare-package</phase>
407407
<goals>
408408
<goal>wget</goal>
409409
</goals>
410410
<configuration>
411-
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-app/3.0.0-SNAPSHOT/tika-app-3.0.0-20240404.081029-473.jar</url>
411+
<url>https://repository.apache.org/content/groups/snapshots/org/apache/tika/tika-app/3.0.0-SNAPSHOT/tika-app-3.0.0-20240404.161946-475.jar</url>
412412
<unpack>false</unpack>
413413
<outputDirectory>${project.build.directory}/lib/tika-app</outputDirectory>
414414
<md5>f5f93da4a09b5f058ef697a2f9939f16</md5>

tika-gui-app/src/main/java/module-info.java

+1
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
requires com.fasterxml.jackson.datatype.jsr310;
2929
requires com.fasterxml.jackson.datatype.jdk8;
3030
requires org.kordamp.ikonli.javafx;
31+
requires com.h2database;
3132

3233
exports org.tallison.tika.app.fx;
3334

tika-gui-app/src/main/java/org/tallison/tika/app/fx/ctx/AppContext.java

+8
Original file line numberDiff line numberDiff line change
@@ -63,11 +63,19 @@ public class AppContext {
6363
}
6464

6565
static {
66+
System.out.println(System.getProperties());
6667
if (!StringUtils.isBlank(System.getProperty("TIKA_GUI_JAVA_HOME"))) {
6768
LOGGER.debug("setting TIKA_GUI_JAVA_HOME {}", System.getProperty("TIKA_GUI_JAVA_HOME"));
6869
TIKA_GUI_JAVA_HOME = Paths.get(System.getProperty("TIKA_GUI_JAVA_HOME"));
6970
} else if (!StringUtils.isBlank(System.getProperty("java.home"))) {
7071
TIKA_GUI_JAVA_HOME = Paths.get(System.getProperty("java.home"));
72+
//TODO -- java_home should not include the bin directory.
73+
//the "if" branch above is normally triggered through the .sh scripts,
74+
//which incorrectly set java_home to java_home/bin
75+
//Clean this up.
76+
if (Files.isDirectory(TIKA_GUI_JAVA_HOME.resolve("bin"))) {
77+
TIKA_GUI_JAVA_HOME = TIKA_GUI_JAVA_HOME.resolve("bin");
78+
}
7179
LOGGER.debug("setting TIKA_GUI_JAVA_HOME {} from java.home",
7280
System.getProperty("java.home"));
7381
}

tika-gui-app/src/main/java/org/tallison/tika/app/fx/emitters/CSVEmitterSpec.java

+6-6
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ private void createTable() throws SQLException {
9494
String dropTable = "drop table if exists " + tableName;
9595
StringBuilder createTable = new StringBuilder();
9696
createTable.append("create table ").append(tableName);
97-
createTable.append("( ").append(PATH_COL_NAME).append(" varchar(1024), ");
97+
createTable.append("( ").append(ID_COLUMN_NAME).append(" varchar(1024), ");
9898
createTable.append(ATTACHMENT_NUM_COL_NAME).append(" int");
9999
for (MetadataTuple t : getMetadataTuples()) {
100100
createTable.append(", ").append(t.getOutput()).append(" ").append(t.getProperty());
@@ -191,7 +191,7 @@ private Optional<Path> getCSVPath() {
191191

192192
private void writeHeaders(CSVPrinter printer) throws IOException {
193193
List<String> headers = new ArrayList<>();
194-
headers.add("path");
194+
headers.add("id");
195195
headers.add("status");
196196
headers.add("attachment_num");
197197
if (getMetadataTuples().size() == 0) {
@@ -208,22 +208,22 @@ private String getSelect() {
208208
String tikaTable = CSV_DB_TABLE_NAME;
209209
StringBuilder sb = new StringBuilder();
210210
sb.append("select ");
211-
sb.append("s.").append(PATH_COL_NAME).append(" as Path, s.status as Status, ");
211+
sb.append("s.").append(ID_COLUMN_NAME).append(" as id, s.status as Status, ");
212212
sb.append("case when ").append(ATTACHMENT_NUM_COL_NAME).append(" is null then 0");
213213
sb.append(" else ").append(ATTACHMENT_NUM_COL_NAME).append(" end");
214214
for (MetadataTuple t : getMetadataTuples()) {
215215
sb.append(", ");
216216
String out = t.getOutput();
217217
//if there's a column in tika_extracts
218-
if (out.equals(PATH_COL_NAME) || out.equals("status")) {
218+
if (out.equals(ID_COLUMN_NAME) || out.equals("status")) {
219219
sb.append("t.");
220220
}
221221
sb.append(t.getOutput());
222222
}
223223

224224
sb.append(" from tika_status s left join ").append(tikaTable)
225-
.append(" t on s.path = t.path")
226-
.append(" order by s.status, t.path asc, t.attachment_num asc");
225+
.append(" t on s.id = t.id")
226+
.append(" order by s.status, t.id asc, t.attachment_num asc");
227227

228228
return sb.toString();
229229
}

tika-gui-app/src/main/java/org/tallison/tika/app/fx/emitters/JDBCEmitterController.java

+4-4
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
package org.tallison.tika.app.fx.emitters;
1818

1919
import static org.tallison.tika.app.fx.emitters.JDBCEmitterSpec.ATTACHMENT_NUM_COL_NAME;
20-
import static org.tallison.tika.app.fx.emitters.JDBCEmitterSpec.PATH_COL_NAME;
20+
import static org.tallison.tika.app.fx.emitters.JDBCEmitterSpec.ID_COLUMN_NAME;
2121

2222
import java.net.URL;
2323
import java.sql.Connection;
@@ -333,9 +333,9 @@ private boolean validateColumns(ResultSetMetaData metaData) throws SQLException
333333
//TODO -- check column types!
334334
for (int i = 1; i <= metaData.getColumnCount(); i++) {
335335
if (i == 1) {
336-
if (!PATH_COL_NAME.equalsIgnoreCase(metaData.getColumnName(i))) {
336+
if (!ID_COLUMN_NAME.equalsIgnoreCase(metaData.getColumnName(i))) {
337337
alert(ALERT_TITLE, "Unexpected column name",
338-
"First column should be: " + PATH_COL_NAME);
338+
"First column should be: " + ID_COLUMN_NAME);
339339
return false;
340340
}
341341
}
@@ -376,7 +376,7 @@ private boolean validateColumns(ResultSetMetaData metaData) throws SQLException
376376
private boolean tryToCreateTable() {
377377
StringBuilder sb = new StringBuilder();
378378
sb.append("create table ").append(tableName.getText()).append(" (");
379-
sb.append(PATH_COL_NAME).append(" VARCHAR(1024),\n");
379+
sb.append(ID_COLUMN_NAME).append(" VARCHAR(1024),\n");
380380
sb.append(ATTACHMENT_NUM_COL_NAME).append(" INTEGER");
381381
for (MetadataRow r : getMetadataRows()) {
382382
sb.append(",\n");

tika-gui-app/src/main/java/org/tallison/tika/app/fx/emitters/JDBCEmitterSpec.java

+2-2
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ public class JDBCEmitterSpec extends BaseEmitterSpec {
3939

4040
private static final Logger LOGGER = LogManager.getLogger(JDBCEmitterSpec.class);
4141

42-
static String PATH_COL_NAME = "path";
42+
static String ID_COLUMN_NAME = "id";
4343

4444
static String ATTACHMENT_NUM_COL_NAME = "attachment_num";
4545

@@ -108,7 +108,7 @@ public void setConnectionString(String connectionString) {
108108
void createAndSetInsertString(String tableName) {
109109
StringBuilder sb = new StringBuilder();
110110
sb.append("insert into ").append(tableName).append(" (");
111-
sb.append(PATH_COL_NAME).append(", ").append(ATTACHMENT_NUM_COL_NAME);
111+
sb.append(ID_COLUMN_NAME).append(", ").append(ATTACHMENT_NUM_COL_NAME);
112112
int colCount = 2;
113113
for (MetadataTuple t : getMetadataTuples()) {
114114
sb.append(", ");

tika-gui-app/src/main/java/org/tallison/tika/app/fx/emitters/OpenSearchEmitterSpec.java

+2-2
Original file line numberDiff line numberDiff line change
@@ -81,8 +81,8 @@ public void write(DomWriter writer, Element properties) {
8181
public Set<String> getClassPathDependencies() {
8282
Set<String> items = new HashSet<>();
8383
items.add(ProcessUtils.escapeCommandLine(
84-
AppContext.TIKA_LIB_PATH.resolve("tika-emitter-opensearch").toAbsolutePath() +
85-
"/*"));
84+
AppContext.TIKA_LIB_PATH.resolve("tika-emitter-opensearch")
85+
.toAbsolutePath() + "/*"));
8686

8787
return items;
8888
}

0 commit comments

Comments
 (0)