-69
-70
-71
-72
73
74
75
@@ -234,10 +230,15 @@
80
81
82
-83
+83
+84
+85
+86
+87
+88
- # File 'lib/unisec/versions.rb', line 69
+ # File 'lib/unisec/versions.rb', line 73
def self.display # rubocop:disable Metrics/AbcSize
data = versions
@@ -249,6 +250,7 @@
display.call(:twittercldr_icu)
display.call(:twittercldr_cldr)
display.call(:ruby_unicode_emoji)
+ display.call(:ucd_derivedname)
puts Paint["\nGems:", :underline]
display.call(:unisec)
display.call(:twittercldr)
@@ -351,7 +353,11 @@
53
54
55
-56
+56
+57
+58
+59
+60
|
# File 'lib/unisec/versions.rb', line 17
@@ -393,6 +399,10 @@
unicodeconfusable_unicode: {
version: Unicode::Confusable::UNICODE_VERSION,
label: 'Unicode (unicode-confusable gem)'
+ },
+ ucd_derivedname: {
+ version: Unisec::Rugrep.ucd_derivedname_version,
+ label: 'UCD (data/DerivedName.txt)'
}
}
end
@@ -406,9 +416,9 @@
diff --git a/docs/yard/_index.html b/docs/yard/_index.html
index a99e943..bd98dea 100644
--- a/docs/yard/_index.html
+++ b/docs/yard/_index.html
@@ -165,6 +165,21 @@ Namespace Listing A-Z
+
+ - G
+
+
+ -
+ Grep
+
+ (Unisec::CLI::Commands)
+
+
+
+
+
+
+
- H
@@ -244,6 +259,9 @@ Namespace Listing A-Z
+ |
+
+
- R
@@ -255,13 +273,17 @@ Namespace Listing A-Z
+ -
+ Rugrep
+
+ (Unisec)
+
+
+
- |
-
-
- S
@@ -370,9 +392,9 @@ Namespace Listing A-Z
diff --git a/docs/yard/class_list.html b/docs/yard/class_list.html
index 3a4973c..6a87760 100644
--- a/docs/yard/class_list.html
+++ b/docs/yard/class_list.html
@@ -43,7 +43,7 @@
-Integer < Object Top Level Namespace List < Command Unisec::CLI::Commands::Confusables Randomize < Command Unisec::CLI::Commands::Confusables
Hexdump < Command Unisec::CLI::Commands Char < Command Unisec::CLI::Commands::Properties Codepoints < Command Unisec::CLI::Commands::Properties List < Command Unisec::CLI::Commands::Properties
Size < Command Unisec::CLI::Commands From < Command Unisec::CLI::Commands::Surrogates To < Command Unisec::CLI::Commands::Surrogates
+Integer < Object Top Level Namespace List < Command Unisec::CLI::Commands::Confusables Randomize < Command Unisec::CLI::Commands::Confusables
Grep < Command Unisec::CLI::Commands Hexdump < Command Unisec::CLI::Commands Char < Command Unisec::CLI::Commands::Properties Codepoints < Command Unisec::CLI::Commands::Properties List < Command Unisec::CLI::Commands::Properties
Size < Command Unisec::CLI::Commands From < Command Unisec::CLI::Commands::Surrogates To < Command Unisec::CLI::Commands::Surrogates
diff --git a/docs/yard/file.CHANGELOG.html b/docs/yard/file.CHANGELOG.html
index 157fde6..b98b7c2 100644
--- a/docs/yard/file.CHANGELOG.html
+++ b/docs/yard/file.CHANGELOG.html
@@ -58,6 +58,17 @@
[unreleased]
+ [0.0.3]
+ Features
+
+- Add a new class
Unisec::Rugrep and CLI command grep to search for Unicode code point names by regular expression
+- Add a new method
Unisec::Properties.deccp2stdhexcp : Convert from decimal code point to standardized format hexadecimal code point
+
+ Chore
+
+- Enhance tests:
assert_equal(true, test) ➡️ assert(test)
+- Enhance SEO: better description
+
[0.0.2]
- Add 2 new classes (and corresponding CLI command):
@@ -74,9 +85,9 @@
[0.0.1]
diff --git a/docs/yard/file.LICENSE.html b/docs/yard/file.LICENSE.html
index de820dd..be0ff02 100644
--- a/docs/yard/file.LICENSE.html
+++ b/docs/yard/file.LICENSE.html
@@ -77,9 +77,9 @@
diff --git a/docs/yard/file.README.html b/docs/yard/file.README.html
index 5179502..c4b6c14 100644
--- a/docs/yard/file.README.html
+++ b/docs/yard/file.README.html
@@ -90,6 +90,11 @@ Features
- List all Unicode properties name
+- Regexp search
+
+- Search for Unicode code point names by regular expression
+
+
- Size
- Code point, grapheme, UTF-8/UTF-16/UTF-32 byte/unit size
@@ -102,7 +107,7 @@ Features
- Versions
-- Version of Unicode, ICU, CLDR, gems used in Unisec
+- Version of Unicode, ICU, CLDR, UCD, gems used in Unisec
@@ -120,9 +125,9 @@ Author
diff --git a/docs/yard/file.about.html b/docs/yard/file.about.html
index b80737d..b324812 100644
--- a/docs/yard/file.about.html
+++ b/docs/yard/file.about.html
@@ -67,9 +67,9 @@ User documentation
diff --git a/docs/yard/file.documentation.html b/docs/yard/file.documentation.html
index 618edca..143f0c0 100644
--- a/docs/yard/file.documentation.html
+++ b/docs/yard/file.documentation.html
@@ -76,9 +76,9 @@ Build & serve locally
diff --git a/docs/yard/file.install.html b/docs/yard/file.install.html
index 902bcd9..ade514e 100644
--- a/docs/yard/file.install.html
+++ b/docs/yard/file.install.html
@@ -111,9 +111,9 @@ No install
diff --git a/docs/yard/file.publishing.html b/docs/yard/file.publishing.html
index dc2cc0e..077497d 100644
--- a/docs/yard/file.publishing.html
+++ b/docs/yard/file.publishing.html
@@ -84,9 +84,9 @@
diff --git a/docs/yard/file.quick-start.html b/docs/yard/file.quick-start.html
index 3626322..5747992 100644
--- a/docs/yard/file.quick-start.html
+++ b/docs/yard/file.quick-start.html
@@ -78,9 +78,9 @@ Default usage: library
diff --git a/docs/yard/file.usage.html b/docs/yard/file.usage.html
index ba50a29..249d498 100644
--- a/docs/yard/file.usage.html
+++ b/docs/yard/file.usage.html
@@ -64,9 +64,12 @@ General help
$ unisec --help
Commands:
unisec confusables [SUBCOMMAND]
+ unisec grep REGEXP # Search for Unicode code point names by regular expression
unisec hexdump INPUT # Hexdump in all Unicode encodings
unisec properties [SUBCOMMAND]
+ unisec size INPUT # All kinds of size information about a Unicode string
unisec surrogates [SUBCOMMAND]
+ unisec versions # Version of anything related to Unicode as used in unisec
List sub-commands:
$ unisec surrogates --help
@@ -100,6 +103,7 @@ Examples
Randomize
+Grep
Hexdump
Properties
@@ -123,7 +127,8 @@ Library
+Regexp search
+
+- Search for Unicode code point names by regular expression
+
+
Size
- Code point, grapheme, UTF-8/UTF-16/UTF-32 byte/unit size
@@ -102,7 +107,7 @@ Features
Versions
-- Version of Unicode, ICU, CLDR, gems used in Unisec
+- Version of Unicode, ICU, CLDR, UCD, gems used in Unisec
@@ -120,9 +125,9 @@ Author
diff --git a/docs/yard/method_list.html b/docs/yard/method_list.html
index 2a8c69f..85c2684 100644
--- a/docs/yard/method_list.html
+++ b/docs/yard/method_list.html
@@ -61,6 +61,14 @@
+
+ #call
+ Unisec::CLI::Commands::Grep
+
+
+
+
+
#call
Unisec::CLI::Commands::Hexdump
@@ -68,7 +76,7 @@
-
+
#call
Unisec::CLI::Commands::Versions
@@ -76,7 +84,7 @@
-
+
#call
Unisec::CLI::Commands::Properties::List
@@ -84,7 +92,7 @@
-
+
#call
Unisec::CLI::Commands::Properties::Codepoints
@@ -92,7 +100,7 @@
-
+
#call
Unisec::CLI::Commands::Properties::Char
@@ -100,7 +108,7 @@
-
+
#call
Unisec::CLI::Commands::Surrogates::From
@@ -108,7 +116,7 @@
-
+
#call
Unisec::CLI::Commands::Surrogates::To
@@ -116,7 +124,7 @@
-
+
#call
Unisec::CLI::Commands::Confusables::List
@@ -124,7 +132,7 @@
-
+
#call
Unisec::CLI::Commands::Confusables::Randomize
@@ -132,7 +140,7 @@
-
+
char
Unisec::Properties
@@ -140,7 +148,7 @@
-
+
char2codepoint
Unisec::Properties
@@ -148,7 +156,7 @@
-
+
char_display
Unisec::Properties
@@ -156,7 +164,7 @@
-
+
chars2codepoints
Unisec::Properties
@@ -164,7 +172,7 @@
-
+
code_point
Unisec::Surrogates
@@ -172,7 +180,7 @@
-
+
#code_point
Unisec::Surrogates
@@ -180,7 +188,7 @@
-
+
#code_points_size
Unisec::Size
@@ -188,7 +196,7 @@
-
+
code_points_size
Unisec::Size
@@ -196,7 +204,7 @@
-
+
codepoints
Unisec::Properties
@@ -204,7 +212,7 @@
-
+
codepoints_display
Unisec::Properties
@@ -212,7 +220,7 @@
-
+
convert
Unisec::Utils::String
@@ -220,7 +228,7 @@
-
+
convert_to_integer
Unisec::Utils::String
@@ -228,7 +236,7 @@
-
+
#cp
Unisec::Surrogates
@@ -236,6 +244,14 @@
+
+
+
+
+
#display
@@ -396,6 +412,38 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
#to_bin
@@ -413,6 +461,14 @@
+
+
+
+
+
#utf16_bytesize
Unisec::Size
@@ -420,7 +476,7 @@
-
+
utf16_bytesize
Unisec::Size
@@ -428,7 +484,7 @@
-
+
#utf16_unitsize
Unisec::Size
@@ -436,7 +492,7 @@
-
+
utf16_unitsize
Unisec::Size
@@ -444,7 +500,7 @@
-
+
#utf16be
Unisec::Hexdump
@@ -452,7 +508,7 @@
-
+
utf16be
Unisec::Hexdump
@@ -460,7 +516,7 @@
-
+
#utf16le
Unisec::Hexdump
@@ -468,7 +524,7 @@
-
+
utf16le
Unisec::Hexdump
@@ -476,7 +532,7 @@
-
+
#utf32_bytesize
Unisec::Size
@@ -484,7 +540,7 @@
-
+
utf32_bytesize
Unisec::Size
@@ -492,7 +548,7 @@
-
+
#utf32_unitsize
Unisec::Size
@@ -500,7 +556,7 @@
-
+
utf32_unitsize
Unisec::Size
@@ -508,7 +564,7 @@
-
+
#utf32be
Unisec::Hexdump
@@ -516,7 +572,7 @@
-
+
utf32be
Unisec::Hexdump
@@ -524,7 +580,7 @@
-
+
#utf32le
Unisec::Hexdump
@@ -532,7 +588,7 @@
-
+
utf32le
Unisec::Hexdump
@@ -540,7 +596,7 @@
-
+
#utf8
Unisec::Hexdump
@@ -548,7 +604,7 @@
-
+
utf8
Unisec::Hexdump
@@ -556,7 +612,7 @@
-
+
#utf8_bytesize
Unisec::Size
@@ -564,7 +620,7 @@
-
+
utf8_bytesize
Unisec::Size
@@ -572,7 +628,7 @@
-
+
#utf8_unitsize
Unisec::Size
@@ -580,7 +636,7 @@
-
+
utf8_unitsize
Unisec::Size
@@ -588,7 +644,7 @@
-
+
versions
Unisec::Versions
diff --git a/docs/yard/top-level-namespace.html b/docs/yard/top-level-namespace.html
index 472ae39..a560b6a 100644
--- a/docs/yard/top-level-namespace.html
+++ b/docs/yard/top-level-namespace.html
@@ -102,9 +102,9 @@ Defined Under Namespace
diff --git a/lib/unisec.rb b/lib/unisec.rb
index 958a73b..beed5cf 100644
--- a/lib/unisec.rb
+++ b/lib/unisec.rb
@@ -2,9 +2,10 @@
require 'unisec/version'
-require 'unisec/surrogates'
+require 'unisec/confusables'
require 'unisec/hexdump'
require 'unisec/properties'
-require 'unisec/confusables'
-require 'unisec/versions'
+require 'unisec/rugrep'
require 'unisec/size'
+require 'unisec/surrogates'
+require 'unisec/versions'
diff --git a/lib/unisec/cli/cli.rb b/lib/unisec/cli/cli.rb
index 080f62e..e092419 100644
--- a/lib/unisec/cli/cli.rb
+++ b/lib/unisec/cli/cli.rb
@@ -1,11 +1,12 @@
# frozen_string_literal: true
-require 'unisec/cli/surrogates'
+require 'unisec/cli/confusables'
require 'unisec/cli/hexdump'
require 'unisec/cli/properties'
-require 'unisec/cli/confusables'
-require 'unisec/cli/versions'
+require 'unisec/cli/rugrep'
require 'unisec/cli/size'
+require 'unisec/cli/surrogates'
+require 'unisec/cli/versions'
module Unisec
# Module used to create the CLI for the executable
@@ -16,16 +17,17 @@ module Commands
# Mapping between the (sub-)commands as seen by the user
# on the command-line interface and the CLI modules in the lib
- register 'surrogates to', Surrogates::To
- register 'surrogates from', Surrogates::From
- register 'hexdump', Hexdump
- register 'properties list', Properties::List
- register 'properties codepoints', Properties::Codepoints
- register 'properties char', Properties::Char
register 'confusables list', Confusables::List
register 'confusables randomize', Confusables::Randomize
- register 'versions', Versions
+ register 'grep', Grep
+ register 'hexdump', Hexdump
+ register 'properties char', Properties::Char
+ register 'properties codepoints', Properties::Codepoints
+ register 'properties list', Properties::List
register 'size', Size
+ register 'surrogates from', Surrogates::From
+ register 'surrogates to', Surrogates::To
+ register 'versions', Versions
end
end
end
diff --git a/lib/unisec/cli/rugrep.rb b/lib/unisec/cli/rugrep.rb
new file mode 100644
index 0000000..f64ef03
--- /dev/null
+++ b/lib/unisec/cli/rugrep.rb
@@ -0,0 +1,33 @@
+# frozen_string_literal: true
+
+require 'dry/cli'
+require 'unisec'
+
+module Unisec
+ module CLI
+ module Commands
+ # CLI command `unisec grep` for the class {Unisec::Rugrep} from the lib.
+ #
+ # Example:
+ #
+ # ```plaintext
+ # $ unisec grep 'FRENCH \w+'
+ # U+20A3 ₣ FRENCH FRANC SIGN
+ # U+1F35F 🍟 FRENCH FRIES
+ # ```
+ class Grep < Dry::CLI::Command
+ desc 'Search for Unicode code point names by regular expression'
+
+ argument :regexp, required: true,
+ desc: 'regular expression'
+
+ # Search for Unicode code point names by regular expression and print the matches.
+ # @param regexp [String] Regular expression source, without delimiters or modifiers.
+ # Supports everything Ruby Regexp supports
+ def call(regexp: nil, **)
+ puts Unisec::Rugrep.regrep_display(regexp)
+ end
+ end
+ end
+ end
+end
diff --git a/lib/unisec/properties.rb b/lib/unisec/properties.rb
index 620be2a..3467f2c 100644
--- a/lib/unisec/properties.rb
+++ b/lib/unisec/properties.rb
@@ -50,7 +50,7 @@ def self.codepoints(prop)
def self.codepoints_display(prop)
codepoints = Properties.codepoints(prop)
codepoints.each do |cp|
- puts "#{Properties.char2codepoint(cp[:char]).ljust(7)} #{cp[:char].ljust(4)} #{cp[:name]}"
+ puts "#{Properties.deccp2stdhexcp(cp[:codepoint]).ljust(7)} #{cp[:char].ljust(4)} #{cp[:name]}"
end
nil
end
@@ -158,7 +158,7 @@ def self.char_display(chr, extended: false)
# @example
# Unisec::Properties.char2codepoint('💎') # => "U+1F48E"
def self.char2codepoint(chr)
- "U+#{format('%.4x', chr.codepoints.first).upcase}"
+ Properties.deccp2stdhexcp(chr.codepoints.first)
end
# Display the code points in Unicode format for the given characters (code points as string)
@@ -174,5 +174,14 @@ def self.chars2codepoints(chrs)
end
out.join(' ')
end
+
+ # Convert from decimal code point to standardized format hexadecimal code point
+ # @param int_cp [Integer] Code point in decimal format
+ # @return [String] code point in Unicode format
+ # @example
+ # Unisec::Properties.deccp2stdhexcp(128640) # => "U+1F680"
+ def self.deccp2stdhexcp(int_cp)
+ "U+#{format('%.4x', int_cp).upcase}"
+ end
end
end
diff --git a/lib/unisec/rugrep.rb b/lib/unisec/rugrep.rb
new file mode 100644
index 0000000..d18678b
--- /dev/null
+++ b/lib/unisec/rugrep.rb
@@ -0,0 +1,126 @@
+# frozen_string_literal: true
+
+require 'twitter_cldr'
+require 'paint'
+
+module Unisec
+ # Ruby grep : Ruby regular expression search for Unicode code point names
+ class Rugrep
+ # UCD Derived names file location
+ # @see https://www.unicode.org/Public/UCD/latest/ucd/extracted/DerivedName.txt
+ UCD_DERIVEDNAME = File.join(__dir__, '../../data/DerivedName.txt')
+
+ # Search code points by (Ruby) regexp
+ # @param regexp [String] Regular expression source, without delimiters or modifiers (matched case-insensitively).
+ # Supports everything Ruby Regexp supports
+ # @return [Array] Array of code points (`{char: String, codepoint: Integer, name: String}`)
+ # @example
+ # Unisec::Rugrep.regrep('snowman|snowflake')
+ # # =>
+ # # [{:char=>"☃", :codepoint=>9731, :name=>"SNOWMAN"},
+ # # {:char=>"⛄", :codepoint=>9924, :name=>"SNOWMAN WITHOUT SNOW"},
+ # # {:char=>"⛇", :codepoint=>9927, :name=>"BLACK SNOWMAN"},
+ # # {:char=>"❄", :codepoint=>10052, :name=>"SNOWFLAKE"},
+ # # {:char=>"❅", :codepoint=>10053, :name=>"TIGHT TRIFOLIATE SNOWFLAKE"},
+ # # {:char=>"❆", :codepoint=>10054, :name=>"HEAVY CHEVRON SNOWFLAKE"}]
+ # Unisec::Rugrep.regrep('greek small letter \w+')
+ # # =>
+ # # [{:char=>"ͱ", :codepoint=>881, :name=>"GREEK SMALL LETTER HETA"},
+ # # {:char=>"ͳ", :codepoint=>883, :name=>"GREEK SMALL LETTER ARCHAIC SAMPI"},
+ # # {:char=>"ͷ", :codepoint=>887, :name=>"GREEK SMALL LETTER PAMPHYLIAN DIGAMMA"},
+ # # …]
+ def self.regrep(regexp)
+ out = []
+ file = File.new(UCD_DERIVEDNAME)
+ file.each_line(chomp: true) do |line|
+ # Skip if the line is empty or a comment
+ next if line.empty? || line[0] == '#'
+
+ # parse the line to extract code point as integer and the name
+ cp_int, name = line.split(';')
+ cp_int = cp_int.chomp.to_i(16)
+ name.lstrip!
+ next unless /#{regexp}/i.match?(name) # compiling regexp once is surprisingly not faster
+
+ out << {
+ char: TwitterCldr::Utils::CodePoints.to_string([cp_int]),
+ codepoint: cp_int,
+ name: name
+ }
+ end
+ out
+ end
+
+ # Display a CLI-friendly output listing all code points corresponding to a regular expression.
+ # @example
+ # Unisec::Rugrep.regrep_display('snowman|snowflake')
+ # # =>
+ # # U+2603 ☃ SNOWMAN
+ # # U+26C4 ⛄ SNOWMAN WITHOUT SNOW
+ # # U+26C7 ⛇ BLACK SNOWMAN
+ # # U+2744 ❄ SNOWFLAKE
+ # # U+2745 ❅ TIGHT TRIFOLIATE SNOWFLAKE
+ # # U+2746 ❆ HEAVY CHEVRON SNOWFLAKE
+ def self.regrep_display(regexp)
+ codepoints = regrep(regexp)
+ codepoints.each do |cp|
+ puts "#{Properties.deccp2stdhexcp(cp[:codepoint]).ljust(7)} #{cp[:char].ljust(4)} #{cp[:name]}"
+ end
+ nil
+ end
+
+ # Returns the version of Unicode used in UCD local file (data/DerivedName.txt)
+ # @return [String] Unicode version
+ # @example
+ # Unisec::Rugrep.ucd_derivedname_version # => "15.1.0"
+ def self.ucd_derivedname_version
+ first_line = File.open(UCD_DERIVEDNAME, &:readline)
+ first_line.match(/-(\d+\.\d+\.\d+)\.txt/).captures.first
+ end
+
+ # Search code points by (Ruby) regexp
+ # @param regexp [String] Regular expression source, without delimiters or modifiers (matched case-insensitively)
+ # @return [Array] Array of code points (`{char: String, codepoint: Integer, name: String}`)
+ # @example
+ # Unisec::Rugrep.regrep_slow('snowman|snowflake')
+ # # =>
+ # # [{:char=>"☃", :codepoint=>9731, :name=>"SNOWMAN"},
+ # # {:char=>"⛄", :codepoint=>9924, :name=>"SNOWMAN WITHOUT SNOW"},
+ # # {:char=>"⛇", :codepoint=>9927, :name=>"BLACK SNOWMAN"},
+ # # {:char=>"❄", :codepoint=>10052, :name=>"SNOWFLAKE"},
+ # # {:char=>"❅", :codepoint=>10053, :name=>"TIGHT TRIFOLIATE SNOWFLAKE"},
+ # # {:char=>"❆", :codepoint=>10054, :name=>"HEAVY CHEVRON SNOWFLAKE"}]
+ # @note ⚠ This command is very time consuming (~ 1min) and unoptimized (execute one regexp per code point…)
+ def self.regrep_slow(regexp)
+ out = []
+ TwitterCldr::Shared::CodePoint.each do |cp|
+ next unless /#{regexp}/oi.match?(cp.name) # compiling regexp once is surprisingly not faster
+
+ out << {
+ char: TwitterCldr::Utils::CodePoints.to_string([cp.code_point]),
+ codepoint: cp.code_point,
+ name: cp.name
+ }
+ end
+ out
+ end
+
+ # Display a CLI-friendly output listing all code points corresponding to a regular expression.
+ # @example
+ # Unisec::Rugrep.regrep_display_slow('snowman|snowflake')
+ # # =>
+ # # U+2603 ☃ SNOWMAN
+ # # U+26C4 ⛄ SNOWMAN WITHOUT SNOW
+ # # U+26C7 ⛇ BLACK SNOWMAN
+ # # U+2744 ❄ SNOWFLAKE
+ # # U+2745 ❅ TIGHT TRIFOLIATE SNOWFLAKE
+ # # U+2746 ❆ HEAVY CHEVRON SNOWFLAKE
+ def self.regrep_display_slow(regexp)
+ codepoints = regrep_slow(regexp)
+ codepoints.each do |cp|
+ puts "#{Properties.deccp2stdhexcp(cp[:codepoint]).ljust(7)} #{cp[:char].ljust(4)} #{cp[:name]}"
+ end
+ nil
+ end
+ end
+end
diff --git a/lib/unisec/version.rb b/lib/unisec/version.rb
index 7ebdd71..02387ae 100644
--- a/lib/unisec/version.rb
+++ b/lib/unisec/version.rb
@@ -2,5 +2,5 @@
module Unisec
# Version of unisec library and app
- VERSION = '0.0.2'
+ VERSION = '0.0.3'
end
diff --git a/lib/unisec/versions.rb b/lib/unisec/versions.rb
index b51e732..98c8ae9 100644
--- a/lib/unisec/versions.rb
+++ b/lib/unisec/versions.rb
@@ -51,6 +51,10 @@ def self.versions # rubocop:disable Metrics/MethodLength
unicodeconfusable_unicode: {
version: Unicode::Confusable::UNICODE_VERSION,
label: 'Unicode (unicode-confusable gem)'
+ },
+ ucd_derivedname: {
+ version: Unisec::Rugrep.ucd_derivedname_version,
+ label: 'UCD (data/DerivedName.txt)'
}
}
end
@@ -76,6 +80,7 @@ def self.display # rubocop:disable Metrics/AbcSize
display.call(:twittercldr_icu)
display.call(:twittercldr_cldr)
display.call(:ruby_unicode_emoji)
+ display.call(:ucd_derivedname)
puts Paint["\nGems:", :underline]
display.call(:unisec)
display.call(:twittercldr)
diff --git a/test/test_properties.rb b/test/test_properties.rb
index d6a52bd..e1b69b6 100644
--- a/test/test_properties.rb
+++ b/test/test_properties.rb
@@ -13,24 +13,24 @@ def test_unisec_properties_codepoints
cps = Unisec::Properties.codepoints('Quotation_Mark')
assert_kind_of(Array, cps)
assert_kind_of(Hash, cps.first)
- assert_equal(true, cps.first.has_key?(:char))
- assert_equal(true, cps.first.has_key?(:codepoint))
- assert_equal(true, cps.first.has_key?(:name))
+ assert(cps.first.has_key?(:char))
+ assert(cps.first.has_key?(:codepoint))
+ assert(cps.first.has_key?(:name))
end
def test_unisec_properties_char
data = Unisec::Properties.char('é')
assert_kind_of(Hash, data)
- assert_equal(true, data.has_key?(:age))
- assert_equal(true, data.has_key?(:block))
- assert_equal(true, data.has_key?(:category))
- assert_equal(true, data.has_key?(:subcategory))
- assert_equal(true, data.has_key?(:codepoint))
- assert_equal(true, data.has_key?(:name))
- assert_equal(true, data.has_key?(:script))
- assert_equal(true, data.has_key?(:case))
- assert_equal(true, data.has_key?(:normalization))
- assert_equal(true, data.has_key?(:other_properties))
+ assert(data.has_key?(:age))
+ assert(data.has_key?(:block))
+ assert(data.has_key?(:category))
+ assert(data.has_key?(:subcategory))
+ assert(data.has_key?(:codepoint))
+ assert(data.has_key?(:name))
+ assert(data.has_key?(:script))
+ assert(data.has_key?(:case))
+ assert(data.has_key?(:normalization))
+ assert(data.has_key?(:other_properties))
assert_equal('LATIN SMALL LETTER E WITH ACUTE', data[:name])
assert_equal('U+00E9', data[:codepoint])
end
@@ -44,4 +44,9 @@ def test_unisec_properties_chars2codepoints
assert_equal('U+00E9', Unisec::Properties.chars2codepoints('é'))
assert_equal('U+0041 U+005A', Unisec::Properties.chars2codepoints('AZ'))
end
+
+ def test_unisec_properties_deccp2stdhexcp
+ assert_equal('U+1F680', Unisec::Properties.deccp2stdhexcp(128640))
+ assert_equal('U+0020', Unisec::Properties.deccp2stdhexcp(32))
+ end
end
diff --git a/test/test_rugrep.rb b/test/test_rugrep.rb
new file mode 100644
index 0000000..0bf1b5f
--- /dev/null
+++ b/test/test_rugrep.rb
@@ -0,0 +1,24 @@
+# frozen_string_literal: false
+
+require 'minitest/autorun'
+require 'unisec'
+
+class UnisecTest < Minitest::Test
+ def test_unisec_rugrep_regrep
+ search = Unisec::Rugrep.regrep('large \w+ square')
+ assert_kind_of(Array, search)
+ assert(search.first.has_key?(:char))
+ assert(search.first.has_key?(:codepoint))
+ assert(search.first.has_key?(:name))
+ assert_kind_of(String, search.first[:char])
+ assert_kind_of(Integer, search.first[:codepoint])
+ assert_kind_of(String, search.first[:name])
+ search2 = Unisec::Rugrep.regrep('azerty')
+ assert_kind_of(Array, search2)
+ assert_empty(search2)
+ end
+
+ def test_unisec_rugrep_ucd_derivedname_version
+ assert(/\A\d+\.\d+\.\d+\Z/.match?(Unisec::Rugrep.ucd_derivedname_version))
+ end
+end
diff --git a/unisec.gemspec b/unisec.gemspec
index 2c82585..81fdb9c 100644
--- a/unisec.gemspec
+++ b/unisec.gemspec
@@ -7,13 +7,15 @@ Gem::Specification.new do |s|
s.version = Unisec::VERSION
s.platform = Gem::Platform::RUBY
s.summary = 'Unicode Security Toolkit'
- s.description = 'Toolkit for security research manipulating Unicode'
+ s.description = 'Toolkit for security research manipulating Unicode: '
+ s.description += 'confusables, homoglyphs, hexdump, code point, UTF-8, UTF-16, UTF-32, properties, regexp search, '
+ s.description += 'size, grapheme, surrogates, version, ICU, CLDR, UCD'
s.authors = ['Alexandre ZANNI']
s.email = 'alexandre.zanni@europe.com'
s.homepage = 'https://github.com/Acceis/unisec'
s.license = 'MIT'
- s.files = Dir['bin/*'] + Dir['lib/**/*.rb'] + ['LICENSE']
+ s.files = Dir['bin/*', 'lib/**/*.rb', 'data/*', 'LICENSE']
s.bindir = 'bin'
s.executables = s.files.grep(%r{^bin/}) { |f| File.basename(f) }
s.require_paths = ['lib']
|