Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add scan region #1

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
202 changes: 202 additions & 0 deletions android/src/main/java/com/visioncameratextrecognition/BitmapUtils.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
package com.visioncameratextrecognition

import android.graphics.Bitmap
import android.graphics.BitmapFactory
import android.graphics.ImageFormat
import android.graphics.Matrix
import android.graphics.Rect
import android.graphics.YuvImage
import android.media.Image.Plane
import android.util.Log
import com.mrousavy.camera.core.FrameInvalidError
import com.mrousavy.camera.core.types.Orientation
import com.mrousavy.camera.frameprocessors.Frame
import java.io.ByteArrayOutputStream
import java.nio.ByteBuffer

object BitmapUtils {

    // Tag fixed: the original "VisionProcessorBase" was copied from the ML Kit sample.
    private const val TAG = "BitmapUtils"

    // JPEG quality for the intermediate NV21 -> JPEG -> Bitmap conversion.
    private const val JPEG_QUALITY = 80

    /**
     * Converts an NV21-format byte buffer into an upright [Bitmap].
     *
     * [BitmapFactory] cannot decode raw NV21, so the data is first compressed to an
     * in-memory JPEG via [YuvImage] and then decoded. The decoded bitmap is rotated
     * according to [FrameMetadata.rotation].
     *
     * @return the decoded, rotated bitmap, or null if decoding or compression fails.
     */
    private fun getBitmap(data: ByteBuffer, metadata: FrameMetadata): Bitmap? {
        data.rewind()
        val imageInBuffer = ByteArray(data.limit())
        data.get(imageInBuffer, 0, imageInBuffer.size)
        try {
            val image = YuvImage(
                imageInBuffer, ImageFormat.NV21, metadata.width, metadata.height, null
            )
            ByteArrayOutputStream().use { stream ->
                image.compressToJpeg(
                    Rect(0, 0, metadata.width, metadata.height), JPEG_QUALITY, stream
                )
                // decodeByteArray is a platform type and may return null on bad data.
                val bmp = BitmapFactory.decodeByteArray(stream.toByteArray(), 0, stream.size())
                    ?: return null
                return rotateBitmap(bmp, metadata.rotation, flipX = false, flipY = false)
            }
        } catch (e: Exception) {
            Log.e(TAG, "Error converting NV21 buffer to bitmap: " + e.message)
        }
        return null
    }

    /**
     * Converts a YUV_420_888 [Frame] from the Vision Camera API to an upright [Bitmap].
     *
     * @return the converted bitmap, or null if conversion fails.
     * @throws FrameInvalidError if the frame's underlying image is no longer valid.
     */
    @Throws(FrameInvalidError::class)
    fun getBitmap(image: Frame): Bitmap? {
        val frameMetadata = FrameMetadata.Builder()
            .setWidth(image.width)
            .setHeight(image.height)
            .setRotation(getRotationDegreeFromOrientation(image.orientation))
            .build()

        val nv21Buffer =
            yuv420ThreePlanesToNV21(image.image.planes, image.width, image.height)
        return getBitmap(nv21Buffer, frameMetadata)
    }

    /**
     * Maps a Vision Camera [Orientation] to the clockwise rotation in degrees needed
     * to display the frame upright. Unknown orientations fall back to 0.
     */
    private fun getRotationDegreeFromOrientation(orientation: Orientation): Int =
        when (orientation) {
            Orientation.PORTRAIT -> 0
            Orientation.LANDSCAPE_LEFT -> 270
            Orientation.LANDSCAPE_RIGHT -> 90
            Orientation.PORTRAIT_UPSIDE_DOWN -> 180
            else -> 0 // Defensive fallback for any future orientation values.
        }

    /**
     * Rotates (and optionally mirrors) a bitmap produced from a byte buffer.
     *
     * The input bitmap is recycled when a new bitmap is created, so callers must not
     * use [bitmap] after this returns.
     */
    private fun rotateBitmap(
        bitmap: Bitmap, rotationDegrees: Int, flipX: Boolean, flipY: Boolean
    ): Bitmap {
        val matrix = Matrix()

        // Rotate the image back to straight.
        matrix.postRotate(rotationDegrees.toFloat())

        // Mirror the image along the X or Y axis.
        matrix.postScale(if (flipX) -1.0f else 1.0f, if (flipY) -1.0f else 1.0f)
        val rotatedBitmap =
            Bitmap.createBitmap(bitmap, 0, 0, bitmap.width, bitmap.height, matrix, true)

        // createBitmap may return the source unchanged (identity matrix);
        // only recycle when a distinct bitmap was allocated.
        if (rotatedBitmap != bitmap) {
            bitmap.recycle()
        }
        return rotatedBitmap
    }

    /**
     * Converts YUV_420_888 to an NV21 bytebuffer.
     *
     * The NV21 format consists of a single byte array containing the Y, U and V values. For an
     * image of size S, the first S positions of the array contain all the Y values. The remaining
     * positions contain interleaved V and U values. U and V are subsampled by a factor of 2 in both
     * dimensions, so there are S/4 U values and S/4 V values. In summary, the NV21 array will
     * contain S Y values followed by S/4 VU pairs: YYYYYYYYYYYYYY(...)YVUVUVUVU(...)VU
     *
     * YUV_420_888 is a generic format that can describe any YUV image where U and V are subsampled
     * by a factor of 2 in both dimensions. `Image.getPlanes` returns an array with the Y, U and
     * V planes. The Y plane is guaranteed not to be interleaved, so we can just copy its values
     * into the first part of the NV21 array. The U and V planes may already have the NV21
     * representation: this happens when the planes share the same buffer, the V buffer is one
     * position before the U buffer and the planes have a pixelStride of 2. If so, they are copied
     * wholesale; otherwise each plane is unpacked pixel by pixel.
     */
    private fun yuv420ThreePlanesToNV21(
        yuv420888planes: Array<Plane>, width: Int, height: Int
    ): ByteBuffer {
        val imageSize = width * height
        val out = ByteArray(imageSize + 2 * (imageSize / 4))

        if (areUVPlanesNV21(yuv420888planes, width, height)) {
            // Copy the Y values.
            yuv420888planes[0].buffer.get(out, 0, imageSize)

            val uBuffer = yuv420888planes[1].buffer
            val vBuffer = yuv420888planes[2].buffer
            // Get the first V value from the V buffer, since the U buffer does not contain it.
            vBuffer.get(out, imageSize, 1)
            // Copy the first U value and the remaining VU values from the U buffer.
            uBuffer.get(out, imageSize + 1, 2 * imageSize / 4 - 1)
        } else {
            // Fallback to copying the UV values one by one, which is slower but also works.
            // Unpack Y.
            unpackPlane(yuv420888planes[0], width, height, out, 0, 1)
            // Unpack U (interleaved at odd offsets after the Y block).
            unpackPlane(yuv420888planes[1], width, height, out, imageSize + 1, 2)
            // Unpack V (interleaved at even offsets after the Y block).
            unpackPlane(yuv420888planes[2], width, height, out, imageSize, 2)
        }

        return ByteBuffer.wrap(out)
    }

    /**
     * Checks if the UV plane buffers of a YUV_420_888 image are already in the NV21 format
     * (shared buffer, V leading U by one byte, pixelStride 2). Buffer position/limit are
     * restored before returning, so this check has no side effects on the planes.
     */
    private fun areUVPlanesNV21(planes: Array<Plane>, width: Int, height: Int): Boolean {
        val imageSize = width * height

        val uBuffer = planes[1].buffer
        val vBuffer = planes[2].buffer

        // Backup buffer properties.
        val vBufferPosition = vBuffer.position()
        val uBufferLimit = uBuffer.limit()

        // Advance the V buffer by 1 byte, since the U buffer will not contain the first V value.
        vBuffer.position(vBufferPosition + 1)
        // Chop off the last byte of the U buffer, since the V buffer will not contain the last
        // U value.
        uBuffer.limit(uBufferLimit - 1)

        // Check that the buffers are equal and have the expected number of elements.
        val areNV21 =
            (vBuffer.remaining() == (2 * imageSize / 4 - 2)) && (vBuffer.compareTo(uBuffer) == 0)

        // Restore buffers to their initial state.
        vBuffer.position(vBufferPosition)
        uBuffer.limit(uBufferLimit)

        return areNV21
    }

    /**
     * Unpacks an image plane into a byte array.
     *
     * The input plane data is copied into [out], starting at [offset], with every pixel spaced
     * by [pixelStride]. There is no row padding on the output.
     */
    private fun unpackPlane(
        plane: Plane, width: Int, height: Int, out: ByteArray, offset: Int, pixelStride: Int
    ) {
        val buffer = plane.buffer
        buffer.rewind()

        // Compute the size of the current plane.
        // We assume that it has the same aspect ratio as the original image.
        val numRow = (buffer.limit() + plane.rowStride - 1) / plane.rowStride
        if (numRow == 0) {
            return
        }
        // NOTE(review): assumes numRow <= height (true for YUV_420_888 planes);
        // a taller plane would make scaleFactor 0 and divide by zero below.
        val scaleFactor = height / numRow
        val numCol = width / scaleFactor

        // Extract the data in the output buffer.
        var outputPos = offset
        var rowStart = 0
        for (row in 0 until numRow) {
            var inputPos = rowStart
            for (col in 0 until numCol) {
                out[outputPos] = buffer[inputPos]
                outputPos += pixelStride
                inputPos += plane.pixelStride
            }
            rowStart += plane.rowStride
        }
    }
}

Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
package com.visioncameratextrecognition

/**
 * Immutable container for the dimensions and rotation of a camera frame.
 *
 * Instances are created through [Builder]:
 * `FrameMetadata.Builder().setWidth(w).setHeight(h).setRotation(r).build()`
 */
class FrameMetadata private constructor(
    @JvmField val width: Int,
    @JvmField val height: Int,
    @JvmField val rotation: Int
) {
    /** Fluent builder of [FrameMetadata]; all fields default to 0. */
    class Builder {
        private var width = 0
        private var height = 0
        private var rotation = 0

        /** Sets the frame width in pixels. */
        fun setWidth(width: Int): Builder = apply { this.width = width }

        /** Sets the frame height in pixels. */
        fun setHeight(height: Int): Builder = apply { this.height = height }

        /** Sets the frame rotation in degrees. */
        fun setRotation(rotation: Int): Builder = apply { this.rotation = rotation }

        /** Creates a [FrameMetadata] from the current builder state. */
        fun build(): FrameMetadata = FrameMetadata(width, height, rotation)
    }
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package com.visioncameratextrecognition

import android.graphics.Bitmap
import android.graphics.Point
import android.graphics.Rect
import android.media.Image
Expand All @@ -18,12 +19,12 @@ import com.google.mlkit.vision.text.latin.TextRecognizerOptions
import com.mrousavy.camera.frameprocessors.Frame
import com.mrousavy.camera.frameprocessors.FrameProcessorPlugin
import com.mrousavy.camera.frameprocessors.VisionCameraProxy
import java.util.HashMap

class VisionCameraTextRecognitionPlugin(proxy: VisionCameraProxy, options: Map<String, Any>?) :
FrameProcessorPlugin() {

private var recognizer = TextRecognition.getClient(TextRecognizerOptions.DEFAULT_OPTIONS)
private var scanRegion: Map<*, *>? = null
private val latinOptions = TextRecognizerOptions.DEFAULT_OPTIONS
private val chineseOptions = ChineseTextRecognizerOptions.Builder().build()
private val devanagariOptions = DevanagariTextRecognizerOptions.Builder().build()
Expand All @@ -32,6 +33,7 @@ class VisionCameraTextRecognitionPlugin(proxy: VisionCameraProxy, options: Map<S

init {
val language = options?.get("language").toString()
scanRegion = options?.get("scanRegion") as Map<*, *>?
recognizer = when (language) {
"latin" -> TextRecognition.getClient(latinOptions)
"chinese" -> TextRecognition.getClient(chineseOptions)
Expand All @@ -44,9 +46,28 @@ class VisionCameraTextRecognitionPlugin(proxy: VisionCameraProxy, options: Map<S

override fun callback(frame: Frame, arguments: Map<String, Any>?): HashMap<String, Any>? {
val data = WritableNativeMap()
val mediaImage: Image = frame.image
val image =
InputImage.fromMediaImage(mediaImage, frame.imageProxy.imageInfo.rotationDegrees)
var image: InputImage? = null
if (scanRegion != null) {
var bm: Bitmap? = BitmapUtils.getBitmap(frame)
if (bm === null) return null
val left = (scanRegion!!["left"] as Double) / 100.0 * bm.width
val top = (scanRegion!!["top"] as Double) / 100.0 * bm.height
val width = (scanRegion!!["width"] as Double) / 100.0 * bm.width
val height = (scanRegion!!["height"] as Double) / 100.0 * bm.height
bm = Bitmap.createBitmap(
bm,
left.toInt(),
top.toInt(),
width.toInt(),
height.toInt(),
null,
false
)
image = InputImage.fromBitmap(bm,frame.imageProxy.imageInfo.rotationDegrees);
} else {
val mediaImage: Image = frame.image
image = InputImage.fromMediaImage(mediaImage, frame.imageProxy.imageInfo.rotationDegrees)
}
val task: Task<Text> = recognizer.process(image)
try {
val text: Text = Tasks.await(task)
Expand Down
38 changes: 34 additions & 4 deletions ios/VisionCameraTextRecognition.swift
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import MLKitCommon
public class VisionCameraTextRecognition: FrameProcessorPlugin {

private var textRecognizer = TextRecognizer()
private var scanRegion: [String: Int]? = nil
private static let latinOptions = TextRecognizerOptions()
private static let chineseOptions = ChineseTextRecognizerOptions()
private static let devanagariOptions = DevanagariTextRecognizerOptions()
Expand All @@ -23,6 +24,7 @@ public class VisionCameraTextRecognition: FrameProcessorPlugin {
public override init(proxy: VisionCameraProxyHolder, options: [AnyHashable: Any]! = [:]) {
super.init(proxy: proxy, options: options)
let language = options["language"] as? String ?? "latin"
scanRegion = options["scanRegion"] as? [String: Int]
switch language {
case "chinese":
self.textRecognizer = TextRecognizer.textRecognizer(options: VisionCameraTextRecognition.chineseOptions)
Expand All @@ -40,11 +42,39 @@ public class VisionCameraTextRecognition: FrameProcessorPlugin {

public override func callback(_ frame: Frame, withArguments arguments: [AnyHashable: Any]?) -> Any {
let buffer = frame.buffer
let image = VisionImage(buffer: buffer)
image.orientation = getOrientation(orientation: frame.orientation)

var image: VisionImage?;
do {
let result = try self.textRecognizer.results(in: image)
if scanRegion != nil {
guard let pixelBuffer = CMSampleBufferGetImageBuffer(buffer) else {
return [:]
}
let ciImage = CIImage(cvPixelBuffer: pixelBuffer).oriented(.right)
let context = CIContext(options: nil)
if let cgImage = context.createCGImage(ciImage, from: ciImage.extent) {
let imgWidth = Double(cgImage.width)
let imgHeight = Double(cgImage.height)
let left:Double = Double(scanRegion?["left"] ?? 0) / 100.0 * imgWidth
let top:Double = Double(scanRegion?["top"] ?? 0) / 100.0 * imgHeight
let width:Double = Double(scanRegion?["width"] ?? 100) / 100.0 * imgWidth
let height:Double = Double(scanRegion?["height"] ?? 100) / 100.0 * imgHeight
let cropRegion = CGRect(
x: left,
y: top,
width: width,
height: height
)
guard let croppedCGImage = cgImage.cropping(to: cropRegion) else {
return [:]
}
let uiImage = UIImage(cgImage: croppedCGImage)
image = VisionImage(image: uiImage)
print("using cropped image")
}
}else{
image = VisionImage(buffer: buffer)
image!.orientation = getOrientation(orientation: frame.orientation)
}
let result = try self.textRecognizer.results(in: image!)
let blocks = VisionCameraTextRecognition.processBlocks(blocks: result.blocks)
data["resultText"] = result.text
data["blocks"] = blocks
Expand Down
2 changes: 1 addition & 1 deletion src/Camera.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ export const Camera = forwardRef(function Camera(
const frameProcessor: ReadonlyFrameProcessor = useFrameProcessor(
(frame: Frame) => {
'worklet';
const data: Text[] | string = plugin(frame);
const data: Text | string = plugin(frame);
// @ts-ignore
useWorklets(data);
},
Expand Down
4 changes: 2 additions & 2 deletions src/scanText.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@ export function createTextRecognitionPlugin(
throw new Error(LINKING_ERROR);
}
return {
scanText: (frame: Frame): Text[] => {
scanText: (frame: Frame): Text => {
'worklet';
// @ts-ignore
return plugin.call(frame) as Text[];
return plugin.call(frame) as Text;
},
};
}
Loading