recognition.py
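
'''Real-time hand gesture recognition from a webcam feed.

Uses MediaPipe Hands to locate a hand, crops a square region of interest
around it, preprocesses the crop with Canny edge detection, and classifies
the gesture with a pre-trained Keras model (A-Z plus delete/nothing/space).
'''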
import cv2
import mediapipe as mp
import numpy as np
from tensorflow import keras

mp_hands = mp.solutions.hands
IMAGE_SIZE = (100, 100)
model = keras.models.load_model('Hand_Gesture_Model.h5')
classes = [chr(num) for num in range(ord('A'), ord('Z') + 1)]  # List of all capital letters
classes.extend(("d", "n", "s"))  # d -> delete; n -> nothing; s -> space
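# Note: this label order must match the order used during training, since
# predict_class indexes into it with argmax.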


def process_image(image):
    '''Processes the image to be fed into the model.

    Args:
        image: The image to be processed.

    Returns:
        The processed image.
    '''
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    image = cv2.Canny(image, 80, 90)       # Edge detection
    image = cv2.dilate(image, None)        # Dilation followed by erosion
    image = cv2.erode(image, None)         # clears up noise
    image = cv2.resize(image, IMAGE_SIZE)  # Resize to the trained image size
    return image
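
# A minimal sanity check, assuming a local test image (the filename
# 'sample_hand.jpg' is hypothetical); uncomment to preview the preprocessing
# on a still image instead of the webcam stream:
# sample = cv2.imread('sample_hand.jpg')
# cv2.imshow('Processed', process_image(sample))
# cv2.waitKey(0)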


def get_roi_coords(landmarks, padding=25):
    '''Finds the square region-of-interest coordinates from the hand landmarks.

    Relies on the module-level `width` and `height` of the captured frame.

    Args:
        landmarks: The detected hand landmarks (one entry per hand).
        padding: The padding added to the bounding box before it is expanded
            into a square ROI.

    Returns:
        The ROI coordinates as (x_min, y_min, x_max, y_max).
    '''
    for hand_landmarks in landmarks:
        x_values = [landmark.x for landmark in hand_landmarks.landmark]
        y_values = [landmark.y for landmark in hand_landmarks.landmark]
        x_min = min(x_values)
        x_max = max(x_values)
        y_min = min(y_values)
        y_max = max(y_values)

        # Scale the normalised coordinates to actual pixel dimensions
        x_min = int(x_min * width)
        x_max = int(x_max * width)
        y_min = int(y_min * height)
        y_max = int(y_max * height)

        # Apply padding while respecting bounds:
        # minimum values decrease during padding but stay >= 0,
        # maximum values increase but stay <= the image width/height.
        x_min = max(x_min - padding, 0)
        x_max = min(x_max + padding, width)
        y_min = max(y_min - padding, 0)
        y_max = min(y_max + padding, height)

        # Expand the bounding box into a square ROI
        difference = abs((x_max - x_min) - (y_max - y_min))
        if x_max - x_min > y_max - y_min:
            # Need to expand the height of the bounding box.
            # Expand it equally on both sides.
            y_min -= difference // 2
            y_max += difference // 2
            if y_min < 0:
                # The expansion has pushed the box past the top edge;
                # shift the bounding box downwards.
                delta = -y_min
                y_min += delta
                y_max += delta
            elif y_max > height:
                # The box has crossed the bottom edge; shift it upwards.
                delta = y_max - height
                y_min -= delta
                y_max -= delta
        elif x_max - x_min < y_max - y_min:
            # Need to expand the width of the bounding box.
            x_min -= difference // 2
            x_max += difference // 2
            if x_min < 0:
                delta = -x_min
                x_min += delta
                x_max += delta
            elif x_max > width:
                delta = x_max - width
                x_min -= delta
                x_max -= delta
    return (x_min, y_min, x_max, y_max)
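
# Worked example of the square expansion above: a padded 150x100 box in a
# 640x480 frame has difference = 50, so y_min/y_max each move outwards by
# 25 px; if that pushes y_min to -10, both edges are shifted down by
# delta = 10 so the box stays square and inside the frame.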


def predict_class(roi):
    '''Predicts the class of the hand gesture in the image ROI.

    Args:
        roi: The region-of-interest image.

    Returns:
        The predicted class label and the class probabilities.
    '''
    roi = np.expand_dims(roi, axis=0)      # Add the batch dimension the model expects
    probabilities = model(roi).numpy()[0]  # Convert the tensor result to a NumPy array
    prediction = classes[np.argmax(probabilities)]
    return prediction, probabilities
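
# A possible usage sketch (not part of the original flow): since the full
# probability vector is returned, a caller could suppress low-confidence
# predictions; the 0.8 threshold here is an arbitrary illustration.
# prediction, probabilities = predict_class(roi)
# if probabilities.max() < 0.8:
#     prediction = 'n'  # fall back to the 'nothing' class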


cap = cv2.VideoCapture(0)
_, frame = cap.read()
height, width, _ = frame.shape  # Frame dimensions used by get_roi_coords

with mp_hands.Hands(
        model_complexity=0,  # Less latency with lower complexity
        max_num_hands=1,
        min_detection_confidence=0.5,
        min_tracking_confidence=0.5) as hands:
    while cap.isOpened():
        success, image = cap.read()
        if not success:
            continue

        # To improve performance, optionally mark the image as not writeable to
        # pass by reference.
        image.flags.writeable = False
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # cv2 operates in BGR, MediaPipe in RGB
        results = hands.process(image)
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

        if results.multi_hand_landmarks:
            # Crop the ROI, preprocess it, and predict
            x_min, y_min, x_max, y_max = get_roi_coords(results.multi_hand_landmarks)
            roi = image[y_min:y_max, x_min:x_max]
            roi = process_image(roi)
            prediction, probabilities = predict_class(roi)

            # Display the prediction and the ROI
            roi = cv2.cvtColor(roi, cv2.COLOR_GRAY2BGR)
            # Paste the processed ROI back into the image
            image[y_min:y_max, x_min:x_max] = cv2.resize(roi, (x_max - x_min, y_max - y_min))
            cv2.rectangle(image, (x_min, y_min), (x_max, y_max), (0, 0, 0), 2)
            image = cv2.flip(image, 1)  # Flip *before* the text is applied
            cv2.putText(image, prediction, (width - x_max, y_max),
                        cv2.FONT_HERSHEY_DUPLEX, 0.5, (0, 255, 0), 1)
        else:
            image = cv2.flip(image, 1)  # Flip for a selfie-view display

        cv2.imshow('Hand Gesture Recognition', image)
        if cv2.waitKey(5) & 0xFF == ord('q'):
            break

cap.release()
cv2.destroyAllWindows()
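
# To run (assumes a webcam at index 0 and Hand_Gesture_Model.h5 in the
# working directory): python recognition.py; press 'q' to quit.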