Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add scan region #1

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
202 changes: 202 additions & 0 deletions android/src/main/java/com/visioncameratextrecognition/BitmapUtils.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
package com.visioncameratextrecognition

import android.graphics.Bitmap
import android.graphics.BitmapFactory
import android.graphics.ImageFormat
import android.graphics.Matrix
import android.graphics.Rect
import android.graphics.YuvImage
import android.media.Image.Plane
import android.util.Log
import com.mrousavy.camera.core.FrameInvalidError
import com.mrousavy.camera.core.types.Orientation
import com.mrousavy.camera.frameprocessors.Frame
import java.io.ByteArrayOutputStream
import java.nio.ByteBuffer

object BitmapUtils {

    // Tag fixed: the original "VisionProcessorBase" was copied from the ML Kit sample.
    private const val TAG = "BitmapUtils"

    // JPEG quality for the intermediate NV21 -> JPEG -> Bitmap conversion.
    private const val JPEG_QUALITY = 80

    /**
     * Converts an NV21-format byte buffer into an upright [Bitmap].
     *
     * [BitmapFactory] cannot decode raw NV21, so the data is first compressed to an
     * in-memory JPEG via [YuvImage] and then decoded. The decoded bitmap is rotated
     * according to [FrameMetadata.rotation].
     *
     * @return the decoded, rotated bitmap, or null if decoding or compression fails.
     */
    private fun getBitmap(data: ByteBuffer, metadata: FrameMetadata): Bitmap? {
        data.rewind()
        val imageInBuffer = ByteArray(data.limit())
        data.get(imageInBuffer, 0, imageInBuffer.size)
        try {
            val image = YuvImage(
                imageInBuffer, ImageFormat.NV21, metadata.width, metadata.height, null
            )
            ByteArrayOutputStream().use { stream ->
                image.compressToJpeg(
                    Rect(0, 0, metadata.width, metadata.height), JPEG_QUALITY, stream
                )
                // decodeByteArray is a platform type and may return null on bad data.
                val bmp = BitmapFactory.decodeByteArray(stream.toByteArray(), 0, stream.size())
                    ?: return null
                return rotateBitmap(bmp, metadata.rotation, flipX = false, flipY = false)
            }
        } catch (e: Exception) {
            Log.e(TAG, "Error converting NV21 buffer to bitmap: " + e.message)
        }
        return null
    }

    /**
     * Converts a YUV_420_888 [Frame] from the Vision Camera API to an upright [Bitmap].
     *
     * @return the converted bitmap, or null if conversion fails.
     * @throws FrameInvalidError if the frame's underlying image is no longer valid.
     */
    @Throws(FrameInvalidError::class)
    fun getBitmap(image: Frame): Bitmap? {
        val frameMetadata = FrameMetadata.Builder()
            .setWidth(image.width)
            .setHeight(image.height)
            .setRotation(getRotationDegreeFromOrientation(image.orientation))
            .build()

        val nv21Buffer =
            yuv420ThreePlanesToNV21(image.image.planes, image.width, image.height)
        return getBitmap(nv21Buffer, frameMetadata)
    }

    /**
     * Maps a Vision Camera [Orientation] to the clockwise rotation in degrees needed
     * to display the frame upright. Unknown orientations fall back to 0.
     */
    private fun getRotationDegreeFromOrientation(orientation: Orientation): Int =
        when (orientation) {
            Orientation.PORTRAIT -> 0
            Orientation.LANDSCAPE_LEFT -> 270
            Orientation.LANDSCAPE_RIGHT -> 90
            Orientation.PORTRAIT_UPSIDE_DOWN -> 180
            else -> 0 // Defensive fallback for any future orientation values.
        }

    /**
     * Rotates (and optionally mirrors) a bitmap produced from a byte buffer.
     *
     * The input bitmap is recycled when a new bitmap is created, so callers must not
     * use [bitmap] after this returns.
     */
    private fun rotateBitmap(
        bitmap: Bitmap, rotationDegrees: Int, flipX: Boolean, flipY: Boolean
    ): Bitmap {
        val matrix = Matrix()

        // Rotate the image back to straight.
        matrix.postRotate(rotationDegrees.toFloat())

        // Mirror the image along the X or Y axis.
        matrix.postScale(if (flipX) -1.0f else 1.0f, if (flipY) -1.0f else 1.0f)
        val rotatedBitmap =
            Bitmap.createBitmap(bitmap, 0, 0, bitmap.width, bitmap.height, matrix, true)

        // createBitmap may return the source unchanged (identity matrix);
        // only recycle when a distinct bitmap was allocated.
        if (rotatedBitmap != bitmap) {
            bitmap.recycle()
        }
        return rotatedBitmap
    }

    /**
     * Converts YUV_420_888 to an NV21 bytebuffer.
     *
     * The NV21 format consists of a single byte array containing the Y, U and V values. For an
     * image of size S, the first S positions of the array contain all the Y values. The remaining
     * positions contain interleaved V and U values. U and V are subsampled by a factor of 2 in both
     * dimensions, so there are S/4 U values and S/4 V values. In summary, the NV21 array will
     * contain S Y values followed by S/4 VU pairs: YYYYYYYYYYYYYY(...)YVUVUVUVU(...)VU
     *
     * YUV_420_888 is a generic format that can describe any YUV image where U and V are subsampled
     * by a factor of 2 in both dimensions. `Image.getPlanes` returns an array with the Y, U and
     * V planes. The Y plane is guaranteed not to be interleaved, so we can just copy its values
     * into the first part of the NV21 array. The U and V planes may already have the NV21
     * representation: this happens when the planes share the same buffer, the V buffer is one
     * position before the U buffer and the planes have a pixelStride of 2. If so, they are copied
     * wholesale; otherwise each plane is unpacked pixel by pixel.
     */
    private fun yuv420ThreePlanesToNV21(
        yuv420888planes: Array<Plane>, width: Int, height: Int
    ): ByteBuffer {
        val imageSize = width * height
        val out = ByteArray(imageSize + 2 * (imageSize / 4))

        if (areUVPlanesNV21(yuv420888planes, width, height)) {
            // Copy the Y values.
            yuv420888planes[0].buffer.get(out, 0, imageSize)

            val uBuffer = yuv420888planes[1].buffer
            val vBuffer = yuv420888planes[2].buffer
            // Get the first V value from the V buffer, since the U buffer does not contain it.
            vBuffer.get(out, imageSize, 1)
            // Copy the first U value and the remaining VU values from the U buffer.
            uBuffer.get(out, imageSize + 1, 2 * imageSize / 4 - 1)
        } else {
            // Fallback to copying the UV values one by one, which is slower but also works.
            // Unpack Y.
            unpackPlane(yuv420888planes[0], width, height, out, 0, 1)
            // Unpack U (interleaved at odd offsets after the Y block).
            unpackPlane(yuv420888planes[1], width, height, out, imageSize + 1, 2)
            // Unpack V (interleaved at even offsets after the Y block).
            unpackPlane(yuv420888planes[2], width, height, out, imageSize, 2)
        }

        return ByteBuffer.wrap(out)
    }

    /**
     * Checks if the UV plane buffers of a YUV_420_888 image are already in the NV21 format
     * (shared buffer, V leading U by one byte, pixelStride 2). Buffer position/limit are
     * restored before returning, so this check has no side effects on the planes.
     */
    private fun areUVPlanesNV21(planes: Array<Plane>, width: Int, height: Int): Boolean {
        val imageSize = width * height

        val uBuffer = planes[1].buffer
        val vBuffer = planes[2].buffer

        // Backup buffer properties.
        val vBufferPosition = vBuffer.position()
        val uBufferLimit = uBuffer.limit()

        // Advance the V buffer by 1 byte, since the U buffer will not contain the first V value.
        vBuffer.position(vBufferPosition + 1)
        // Chop off the last byte of the U buffer, since the V buffer will not contain the last
        // U value.
        uBuffer.limit(uBufferLimit - 1)

        // Check that the buffers are equal and have the expected number of elements.
        val areNV21 =
            (vBuffer.remaining() == (2 * imageSize / 4 - 2)) && (vBuffer.compareTo(uBuffer) == 0)

        // Restore buffers to their initial state.
        vBuffer.position(vBufferPosition)
        uBuffer.limit(uBufferLimit)

        return areNV21
    }

    /**
     * Unpacks an image plane into a byte array.
     *
     * The input plane data is copied into [out], starting at [offset], with every pixel spaced
     * by [pixelStride]. There is no row padding on the output.
     */
    private fun unpackPlane(
        plane: Plane, width: Int, height: Int, out: ByteArray, offset: Int, pixelStride: Int
    ) {
        val buffer = plane.buffer
        buffer.rewind()

        // Compute the size of the current plane.
        // We assume that it has the same aspect ratio as the original image.
        val numRow = (buffer.limit() + plane.rowStride - 1) / plane.rowStride
        if (numRow == 0) {
            return
        }
        // NOTE(review): assumes numRow <= height (true for YUV_420_888 planes);
        // a taller plane would make scaleFactor 0 and divide by zero below.
        val scaleFactor = height / numRow
        val numCol = width / scaleFactor

        // Extract the data in the output buffer.
        var outputPos = offset
        var rowStart = 0
        for (row in 0 until numRow) {
            var inputPos = rowStart
            for (col in 0 until numCol) {
                out[outputPos] = buffer[inputPos]
                outputPos += pixelStride
                inputPos += plane.pixelStride
            }
            rowStart += plane.rowStride
        }
    }
}

Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
package com.visioncameratextrecognition

/**
 * Immutable container for the dimensions and rotation of a camera frame.
 *
 * Instances are created through [Builder]:
 * `FrameMetadata.Builder().setWidth(w).setHeight(h).setRotation(r).build()`
 */
class FrameMetadata private constructor(
    @JvmField val width: Int,
    @JvmField val height: Int,
    @JvmField val rotation: Int
) {
    /** Fluent builder of [FrameMetadata]; all fields default to 0. */
    class Builder {
        private var width = 0
        private var height = 0
        private var rotation = 0

        /** Sets the frame width in pixels. */
        fun setWidth(width: Int): Builder = apply { this.width = width }

        /** Sets the frame height in pixels. */
        fun setHeight(height: Int): Builder = apply { this.height = height }

        /** Sets the frame rotation in degrees. */
        fun setRotation(rotation: Int): Builder = apply { this.rotation = rotation }

        /** Creates a [FrameMetadata] from the current builder state. */
        fun build(): FrameMetadata = FrameMetadata(width, height, rotation)
    }
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package com.visioncameratextrecognition

import android.graphics.Bitmap
import android.graphics.Point
import android.graphics.Rect
import android.media.Image
Expand All @@ -18,12 +19,12 @@ import com.google.mlkit.vision.text.latin.TextRecognizerOptions
import com.mrousavy.camera.frameprocessors.Frame
import com.mrousavy.camera.frameprocessors.FrameProcessorPlugin
import com.mrousavy.camera.frameprocessors.VisionCameraProxy
import java.util.HashMap

class VisionCameraTextRecognitionPlugin(proxy: VisionCameraProxy, options: Map<String, Any>?) :
FrameProcessorPlugin() {

private var recognizer = TextRecognition.getClient(TextRecognizerOptions.DEFAULT_OPTIONS)
private var scanRegion: Map<*, *>? = null
private val latinOptions = TextRecognizerOptions.DEFAULT_OPTIONS
private val chineseOptions = ChineseTextRecognizerOptions.Builder().build()
private val devanagariOptions = DevanagariTextRecognizerOptions.Builder().build()
Expand All @@ -32,6 +33,7 @@ class VisionCameraTextRecognitionPlugin(proxy: VisionCameraProxy, options: Map<S

init {
val language = options?.get("language").toString()
scanRegion = options?.get("scanRegion") as Map<*, *>?
recognizer = when (language) {
"latin" -> TextRecognition.getClient(latinOptions)
"chinese" -> TextRecognition.getClient(chineseOptions)
Expand All @@ -44,9 +46,28 @@ class VisionCameraTextRecognitionPlugin(proxy: VisionCameraProxy, options: Map<S

override fun callback(frame: Frame, arguments: Map<String, Any>?): HashMap<String, Any>? {
val data = WritableNativeMap()
val mediaImage: Image = frame.image
val image =
InputImage.fromMediaImage(mediaImage, frame.imageProxy.imageInfo.rotationDegrees)
var image: InputImage? = null
if (scanRegion != null) {
var bm: Bitmap? = BitmapUtils.getBitmap(frame)
if (bm === null) return null
val left = (scanRegion!!["left"] as Double) / 100.0 * bm.width
val top = (scanRegion!!["top"] as Double) / 100.0 * bm.height
val width = (scanRegion!!["width"] as Double) / 100.0 * bm.width
val height = (scanRegion!!["height"] as Double) / 100.0 * bm.height
bm = Bitmap.createBitmap(
bm,
left.toInt(),
top.toInt(),
width.toInt(),
height.toInt(),
null,
false
)
image = InputImage.fromBitmap(bm,frame.imageProxy.imageInfo.rotationDegrees);
} else {
val mediaImage: Image = frame.image
image = InputImage.fromMediaImage(mediaImage, frame.imageProxy.imageInfo.rotationDegrees)
}
val task: Task<Text> = recognizer.process(image)
try {
val text: Text = Tasks.await(task)
Expand Down
38 changes: 34 additions & 4 deletions ios/VisionCameraTextRecognition.swift
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import MLKitCommon
public class VisionCameraTextRecognition: FrameProcessorPlugin {

private var textRecognizer = TextRecognizer()
private var scanRegion: [String: Int]? = nil
private static let latinOptions = TextRecognizerOptions()
private static let chineseOptions = ChineseTextRecognizerOptions()
private static let devanagariOptions = DevanagariTextRecognizerOptions()
Expand All @@ -23,6 +24,7 @@ public class VisionCameraTextRecognition: FrameProcessorPlugin {
public override init(proxy: VisionCameraProxyHolder, options: [AnyHashable: Any]! = [:]) {
super.init(proxy: proxy, options: options)
let language = options["language"] as? String ?? "latin"
scanRegion = options["scanRegion"] as? [String: Int]
switch language {
case "chinese":
self.textRecognizer = TextRecognizer.textRecognizer(options: VisionCameraTextRecognition.chineseOptions)
Expand All @@ -40,11 +42,39 @@ public class VisionCameraTextRecognition: FrameProcessorPlugin {

public override func callback(_ frame: Frame, withArguments arguments: [AnyHashable: Any]?) -> Any {
let buffer = frame.buffer
let image = VisionImage(buffer: buffer)
image.orientation = getOrientation(orientation: frame.orientation)

var image: VisionImage?;
do {
let result = try self.textRecognizer.results(in: image)
if scanRegion != nil {
guard let pixelBuffer = CMSampleBufferGetImageBuffer(buffer) else {
return [:]
}
let ciImage = CIImage(cvPixelBuffer: pixelBuffer).oriented(.right)
let context = CIContext(options: nil)
if let cgImage = context.createCGImage(ciImage, from: ciImage.extent) {
let imgWidth = Double(cgImage.width)
let imgHeight = Double(cgImage.height)
let left:Double = Double(scanRegion?["left"] ?? 0) / 100.0 * imgWidth
let top:Double = Double(scanRegion?["top"] ?? 0) / 100.0 * imgHeight
let width:Double = Double(scanRegion?["width"] ?? 100) / 100.0 * imgWidth
let height:Double = Double(scanRegion?["height"] ?? 100) / 100.0 * imgHeight
let cropRegion = CGRect(
x: left,
y: top,
width: width,
height: height
)
guard let croppedCGImage = cgImage.cropping(to: cropRegion) else {
return [:]
}
let uiImage = UIImage(cgImage: croppedCGImage)
image = VisionImage(image: uiImage)
print("using cropped image")
}
}else{
image = VisionImage(buffer: buffer)
image!.orientation = getOrientation(orientation: frame.orientation)
}
let result = try self.textRecognizer.results(in: image!)
let blocks = VisionCameraTextRecognition.processBlocks(blocks: result.blocks)
data["resultText"] = result.text
data["blocks"] = blocks
Expand Down
2 changes: 1 addition & 1 deletion src/Camera.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ export const Camera = forwardRef(function Camera(
const frameProcessor: ReadonlyFrameProcessor = useFrameProcessor(
(frame: Frame) => {
'worklet';
const data: Text[] | string = plugin(frame);
const data: Text | string = plugin(frame);
// @ts-ignore
useWorklets(data);
},
Expand Down
4 changes: 2 additions & 2 deletions src/scanText.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@ export function createTextRecognitionPlugin(
throw new Error(LINKING_ERROR);
}
return {
scanText: (frame: Frame): Text[] => {
scanText: (frame: Frame): Text => {
'worklet';
// @ts-ignore
return plugin.call(frame) as Text[];
return plugin.call(frame) as Text;
},
};
}
Loading