From cd806c57b2d2228156ed18ee7d938e8694708f02 Mon Sep 17 00:00:00 2001 From: Dan Fandrich Date: Mon, 2 Sep 2024 08:14:38 -0700 Subject: [PATCH] Handle &lt; and &gt; for HTML in autodescribe These are probably the two most common character entities, so might as well convert them. Using an HTML parser would be better (or calibre) but the simple text scanning method for HTML is much more likely to have all the required dependencies available, and works just as well in most real world cases. --- autodescribe | 2 +- test-autodescribe-expected | 2 +- testfiles/type1.html | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/autodescribe b/autodescribe index de5c165..e750d1f 100755 --- a/autodescribe +++ b/autodescribe @@ -1059,7 +1059,7 @@ comment_first_line () { # File type: html (HTML text) comment_html () { - COMMENT=$(sed -n 's,^.*<[tT][iI][tT][lL][eE]\>[^>]*>\(.*\)[^>]*>\(.*\)/g' | head -1) } # File type: kdenlive (Kdenlive file) diff --git a/test-autodescribe-expected b/test-autodescribe-expected index 81094a7..3616d55 100644 --- a/test-autodescribe-expected +++ b/test-autodescribe-expected @@ -28,7 +28,7 @@ 'testfiles/type1.gif' 'GIF comment' 'testfiles/type1.gnumeric' 'Gnumeric Title' 'testfiles/type1.gpx' 'GPX metadata name' -'testfiles/type1.html' 'HTML title is description' +'testfiles/type1.html' 'HTML title <is> description' 'testfiles/type1.ics' 'iCalendar summary' 'testfiles/type1.iso' 'volume_id' 'testfiles/type1.jar' 'Jar Application Name' diff --git a/testfiles/type1.html b/testfiles/type1.html index b4818a9..48e2c45 100644 --- a/testfiles/type1.html +++ b/testfiles/type1.html @@ -1,6 +1,6 @@ - HTML title is description + HTML title &lt;is&gt; description Nothing to see here...