diff --git a/Insights/SignTrack.h5 b/Insights/SignTrack.h5
index 6f4ca5b3e0cd4a1e8baaf6a362845ab3667b8513..ab857b7eaa1780546169c26c297e63f11919babb 100644
Binary files a/Insights/SignTrack.h5 and b/Insights/SignTrack.h5 differ
diff --git a/model.SignTrack b/model.SignTrack
index a158a36d79a704b8afd4abfe076ee7b7270313de..3e62216fe1b9cd8e7cab0f71e74b8b683b9cc49a 100644
Binary files a/model.SignTrack and b/model.SignTrack differ
diff --git a/signtrack/SignTrack_DataColect.py b/signtrack/SignTrack_DataColect.py
index a0049c57066d7f241c9f52565302b9d614039122..8b85d2c0b854d0faf83616907469ecf2ef34f04c 100644
--- a/signtrack/SignTrack_DataColect.py
+++ b/signtrack/SignTrack_DataColect.py
@@ -9,13 +9,13 @@ from pathlib import Path
 from essentials import mediapipe_detection, extract_keypoints, display_styled_landmarks
 
 # Dataset export location, Changing requires changes in Signtrack_Train
-data_path = os.path.join('Dataset')
+data_path = os.path.join('Test')
 
 # Actions that we try to detect, Changing requires changes in Signtrack_Train
-signs = np.array(["sorry"])
+signs = np.array(["no"])
 
 # Number of sequences to be collected for each action
-no_datapacks = 25
+no_datapacks = 5
 
 # Frames per sequence, Changing requires changes in Signtrack_Train
 seq_length = 24
diff --git a/signtrack/SignTrack_Train.py b/signtrack/SignTrack_Train.py
index 38bbcb305f9654527dd3b4185608de1dc5a52186..653977ef897651776242d2dbfd00920c5f862af2 100644
--- a/signtrack/SignTrack_Train.py
+++ b/signtrack/SignTrack_Train.py
@@ -42,7 +42,7 @@ for sign in signs:
             window_aug.append(res)
         # Randomly duplicating images in a copy of res
         # Used for data augmentation
-        for i in range(round(sequence_length * 0.5)):
+        for i in range(round(sequence_length * 0.75)):
             rand = random.randint(1, sequence_length-1)
             window_aug[rand] = window_aug[rand-1]
         sequences.append(window)
@@ -77,7 +77,7 @@ loss = 1
 AutoTrain = model.fit(X_train, y_train, epochs=1)
 
 for i in range(250):
-    if AutoTrain.history['loss'][-1] >= 0.04:
+    if AutoTrain.history['loss'][-1] >= 0.05:
         AutoTrain = model.fit(X_train, y_train, epochs=1)
 
     if AutoTrain.history['loss'][-1] < loss:
diff --git a/signtrack/__pycache__/Packr.cpython-38.pyc b/signtrack/__pycache__/Packr.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a9a8972ea79bd62f5ad0f961484c1093c435f9b4
Binary files /dev/null and b/signtrack/__pycache__/Packr.cpython-38.pyc differ
diff --git a/signtrack/__pycache__/essentials.cpython-38.pyc b/signtrack/__pycache__/essentials.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c1b7d8c038c033b5db23cdb57c38fbab18fc6a8d
Binary files /dev/null and b/signtrack/__pycache__/essentials.cpython-38.pyc differ
diff --git a/signtrack/essentials.py b/signtrack/essentials.py
index 4b900e6cf664595a7e6e4baf1132b734e5303cd7..4b78e0fb4d73784a6b36ee08a267c2df236e54ca 100644
--- a/signtrack/essentials.py
+++ b/signtrack/essentials.py
@@ -75,7 +75,7 @@ def HandsOnScene(results):
 
 def extract_keypoints(results):
     """
-    This is utilized to convert the keypoint results to a flat numpy array, thats is easy to save and proccess
+    This is utilized to convert the keypoint results to a flat numpy array, that's easy to save and process
     """
     lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten(
     ) if results.left_hand_landmarks else np.zeros(21*3)
@@ -89,7 +89,7 @@ def extract_keypoints(results):
 def grammar_correct(sentence):
     """
     Grammar in sign language often differs from the on in written speech.
-    Using this function, the sentence is corrected from simple grammatical
+    Using this function, the sentence is corrected from common grammatical
     errors
     """
     for key in phrases:
diff --git a/signtrack/main copy.py b/signtrack/main copy.py
new file mode 100644
index 0000000000000000000000000000000000000000..57915c098ed23565a437f1f09b17432c832002ee
--- /dev/null
+++ b/signtrack/main copy.py
@@ -0,0 +1,246 @@
+from re import I
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.layers import LSTM, Dense
+import cv2
+import cvzone
+import shutil
+import numpy as np
+import mediapipe as mp
+from essentials import mediapipe_detection, display_styled_landmarks, extract_keypoints, HandsOnScene, grammar_correct
+import Packr
+import random
+
+
+# The number of frames per sequence that the model has been trained on
+seq_length = 24
+
+# Choose camera input
+cap = cv2.VideoCapture(1)
+
+# Resize camera input
+cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
+cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)
+
+shutil.rmtree('tmp', True)  # Erase previous temporary data
+
+# Unpacks the Model.Pack file and loads the features (signs) of the model
+signs = np.load(Packr.ModelIDUnpack())
+
+mp_holistic = mp.solutions.holistic  # Holistic model
+mp_drawing = mp.solutions.drawing_utils  # Drawing utilities
+
+# Setting up the model architecture
+model = Sequential()
+model.add(LSTM(64, return_sequences=True,
+               activation='relu', input_shape=(24, 258)))
+model.add(LSTM(128, return_sequences=True, activation='relu'))
+model.add(LSTM(64, return_sequences=False, activation='relu'))
+model.add(Dense(64, activation='relu'))
+model.add(Dense(32, activation='relu'))
+model.add(Dense(signs.shape[0], activation='softmax'))
+
+# Loading the model
+model.load_weights("tmp/Insights/SignTrack.h5")
+
+shutil.rmtree('tmp', True)  # Erase temporary data
+
+
+# Import image assets and resize them to fit the output frame
+TopBar = cv2.imread("Assets/TopBar.png", cv2.IMREAD_UNCHANGED)
+TopBar = cv2.resize(TopBar, (0, 0), None, 0.36, 0.42)
+
+BottomBar = cv2.imread("Assets/BottomBar.png", cv2.IMREAD_UNCHANGED)
+BottomBar = cv2.resize(BottomBar, (0, 0), None, 0.36, 0.42)
+
+
+def prob_vis(res, actions, input_frame):
+    '''
+    Adds to the image a visualization of the chances
+    that each action appears in the image.
+    Returns the final image after the process.
+    '''
+    output_frame = input_frame.copy()
+    resfin = {}
+    resfinshorted = {}
+    to_add = 0
+
+    # Create a dict with the probability of each sign being in the frames, as a percentage
+    for num, prob in enumerate(res):
+        resfin.update({round(prob*100, 2): actions[num]})
+
+    # Adding '--' for non-existing values
+    to_add = 7 - len(resfin)
+    if 0 < to_add:
+        for i in range(to_add):
+            resfin.update({0.0001*i: '--'})
+
+    # Creating a sorted version of the dictionary with the most probable signs going first
+    for i in sorted(resfin, reverse=True):
+        resfinshorted[i] = resfin[i]
+
+    # Initializing lists with the sorted signs and the probability of their presence
+    ResValShorted = list(resfinshorted.values())
+    ResKeysShorted = list(resfinshorted.keys())
+
+    # Positioning the assets and the text on the image
+    """
+    Adds the bottom bar on the frame, the position is calculated:
+    For the X axis: by calculating half of the width of the frame
+    minus half of the width of the bottom bar
+    For the Y axis: by calculating the height of the frame minus
+    the height of the bar while leaving 17 pixels of space from the top
+    """
+
+    output_frame = cvzone.overlayPNG(
+        output_frame, BottomBar, [round(wc/2-wb/2), round(hc-hb-17)])
+
+    """
+    Adds the top bar on the frame, the position is calculated:
+    For the X axis: by calculating half of the difference between
+    the width of the frame and the width of the bottom bar
+    For the Y axis: by calculating the height of the bar minus
+    the height of the bar divided by 1.5
+    """
+    output_frame = cvzone.overlayPNG(
+        output_frame, TopBar, [round((wc-wt)/2), round(ht-(ht/1.5))])
+
+    for i in range(6):
+        """
+        Prints the 6 most probable signs on the top bar,
+        the position is calculated knowing that:
+        For the X axis: the distance between each window is
+        105 pixels, while the first window is 52 pixels
+        from 0 and each letter has approximately a 5.25
+        pixel width
+        The Y axis position remains the same at 35 pixels
+        """
+        cv2.putText(output_frame, str(ResValShorted[i]), ((round((52 + 105*(i)-len(ResValShorted[i])*5.25))),
+                    35), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 1, cv2.LINE_AA)
+    for i in range(3):
+        """
+        Prints the probabilities of the 3 most probable signs
+        on the top bar, the position is calculated knowing that:
+        For the X axis: the distance between each window is
+        105 pixels, while the first window is 46 pixels from 0
+        The Y axis position remains the same at 53 pixels
+        """
+        cv2.putText(output_frame, str(round(ResKeysShorted[i], 1)), ((round((46 + 105*(i)))),
+                    53), cv2.FONT_HERSHEY_SIMPLEX, 0.3, (0, 0, 0), 1, cv2.LINE_AA)
+
+    """
+    Adds the sentence text on the bottom bar, the position is calculated by:
+    For the X axis: dividing the width of the frame by 6
+    For the Y axis: calculating the difference between the height of the
+    frame and the height of the bar while keeping a 3 pixel
+    distance from the top of the bar
+    """
+    cv2.putText(output_frame, text.capitalize(), (round(wc/6), hc-hb+3),
+                cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 0, 0), 1, cv2.LINE_AA)
+
+    return output_frame
+
+
+# Initializing empty values
+text = ''
+seq, sentence = [], []
+HandsOnPrevFrames = [False]
+threshold = 0.90
+res = np.zeros(shape=signs.shape[0])
+# Set up the Mediapipe model
+holistic = mp_holistic.Holistic(
+    min_detection_confidence=0.5, min_tracking_confidence=0.5)
+
+
+while cap.isOpened():
+
+    # Read feed
+    ret, frame = cap.read()
+
+    # Make detections
+    img, results = mediapipe_detection(frame, holistic)
+
+    # Loading the dimensions of cap and the assets
+    hb, wb, cb = BottomBar.shape
+    hc, wc, cc = img.shape
+    ht, wt, ct = TopBar.shape
+    # Draw landmarks
+    display_styled_landmarks(img, results)
+
+    # 2. Prediction logic
+
+    # Creates a history of the HandsOnScene results
+    HandsOnPrevFrames.append(HandsOnScene(results))
+    HandsOnPrevFrames = HandsOnPrevFrames[-44:]
+
+    keypoints = extract_keypoints(results)
+
+    # If the hands are in the frame, append the keypoints to seq so the model can make predictions later
+    if True in HandsOnPrevFrames[-5:]:
+        if HandsOnScene(results):
+            seq.append(keypoints)
+            seq = seq[-24:]
+            text = grammar_correct((' '.join(sentence)))
+    # Else, if hands are not in the scene for the last 5 frames, clear the sequence data
+    else:
+        text = grammar_correct((' '.join(sentence)))
+        seq = []
+        res = np.zeros(shape=len(signs))
+
+# If the hands are not visible in the last 44 frames, clear the sentence and process the displayed text
+    if not True in HandsOnPrevFrames:
+        if len(sentence) > 0:
+            # Capitalizing the needed letters
+            text = grammar_correct(text)
+            text = text.capitalize()
+            sentence = []
+
+    # If there are 24 frames in seq then call the model to predict
+    if len(seq) == seq_length:
+        res = model.predict(np.expand_dims(seq, axis=0))[0]
+
+        """
+        In case there is more than 65% of the amount of needed data,
+        call the model to predict on a new version of seq
+        with randomly duplicated frames
+        """
+
+    elif len(seq) >= seq_length * 0.65:
+        missing = seq_length - len(seq)
+        seqpros = seq
+        for i in range(missing):
+            rand = random.randint(0, len(seq)-1)
+            seqpros.insert(rand, seq[rand])
+        res = model.predict(np.expand_dims(seqpros, axis=0))[0]
+        seqpros = []
+        res = np.zeros(shape=len(signs))
+
+    # 3. Viz logic
+
+    # If the probability of the most probable sign is more than the threshold
+    if res[np.argmax(res)] > threshold:
+        # Check whether it is different from the last prediction, then append it to the sentence
+        if len(sentence) > 0:
+            if signs[np.argmax(res)] != sentence[-1]:
+                sentence.append(signs[np.argmax(res)])
+        # If the sentence is empty, just add the prediction
+        else:
+            sentence.append(signs[np.argmax(res)])
+
+    # Keep the last 6 phrases in sentence
+    sentence = sentence[-6:]
+
+    # Visualizing probabilities
+    img = prob_vis(res, signs, img)
+
+    # Display the final image
+    cv2.imshow('SignTrack', img)
+
+    # End properly
+    if cv2.waitKey(10) & 0xFF == ord('q'):
+        break
+# Terminating the window
+cap.release()
+cv2.destroyAllWindows()
diff --git a/signtrack/main.py b/signtrack/main.py
index aa21b70fdce414a6a12767348beb16705d9c9ab0..072d17f9f6f8e29a6e9230393c2b96ec5c2eceb3 100644
--- a/signtrack/main.py
+++ b/signtrack/main.py
@@ -23,7 +23,7 @@ pun = pipeline('ner', model=model, tokenizer=tokenizer)
 seq_length = 24
 
 # Choose camera input
-cap = cv2.VideoCapture(3)
+cap = cv2.VideoCapture(1)
 
 # Resize camera input
 cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
 cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)
@@ -196,7 +196,7 @@ while cap.isOpened():
         seq = []
         res = np.zeros(shape=len(signs))
 
-# if the hands are not visible in teh last 44 frames clearthe sentence and process the displayed text
+# if the hands are not visible in the last 44 frames clear the sentence and process the displayed text
     if not True in HandsOnPrevFrames:
         if len(sentence) > 0:
             sentence = text.capitalize()
diff --git a/signtrack/model.pack b/signtrack/model.pack
deleted file mode 100644
index 0c5d4cafdfced3a552cdeb91a76a339f775de85c..0000000000000000000000000000000000000000
Binary files a/signtrack/model.pack and /dev/null differ
diff --git a/tests/Assets/Asset_col.png b/tests/Assets/Asset_col.png
new file mode 100644
index 0000000000000000000000000000000000000000..ece46d3c6fbef1b8cc43f45f4573a5c3371aafab
Binary files /dev/null and b/tests/Assets/Asset_col.png differ
diff --git a/tests/Assets/Asset_ncol.png b/tests/Assets/Asset_ncol.png
new file mode 100644
index 0000000000000000000000000000000000000000..ea943489f4414191efbd87264046b32ecd56091e
Binary files /dev/null and b/tests/Assets/Asset_ncol.png differ
diff --git a/tests/Assets/Bar.png b/tests/Assets/Bar.png
new file mode 100644
index 0000000000000000000000000000000000000000..fb1f5ab415022ce13cafe7ee5408fb7cd04fd0e0
Binary files /dev/null and b/tests/Assets/Bar.png differ
diff --git a/tests/Assets/BottomBar.png b/tests/Assets/BottomBar.png
new file mode 100644
index 0000000000000000000000000000000000000000..4ad7e88957c23f31d542964c5ff08be5b75516ea
Binary files /dev/null and b/tests/Assets/BottomBar.png differ
diff --git a/tests/Assets/Circle.png b/tests/Assets/Circle.png
new file mode 100644
index 0000000000000000000000000000000000000000..7aecf6990dffb927a6fc643492617504c6eb6050
Binary files /dev/null and b/tests/Assets/Circle.png differ
diff --git a/tests/Assets/TopBar.png b/tests/Assets/TopBar.png
new file mode 100644
index 0000000000000000000000000000000000000000..10d04d7afbda889f12552a2abf026fe7b762761b
Binary files /dev/null and b/tests/Assets/TopBar.png differ
diff --git a/tests/SignTrack.py b/tests/SignTrack Bolt.py
similarity index 100%
rename from tests/SignTrack.py
rename to tests/SignTrack Bolt.py
diff --git a/tests/SignTrack_DataColect.py b/tests/SignTrack_DataColect Bolt.py
similarity index 76%
rename from tests/SignTrack_DataColect.py
rename to tests/SignTrack_DataColect Bolt.py
index a0049c57066d7f241c9f52565302b9d614039122..c9c190c16d745cd6e7a48d8dd4667cf5e43f09b6 100644
--- a/tests/SignTrack_DataColect.py
+++ b/tests/SignTrack_DataColect Bolt.py
@@ -9,13 +9,13 @@ from pathlib import Path
 from essentials import mediapipe_detection, extract_keypoints, display_styled_landmarks
 
 # Dataset export location, Changing requires changes in Signtrack_Train
-data_path = os.path.join('Dataset')
+data_path = os.path.join('test')
 
 # Actions that we try to detect, Changing requires changes in Signtrack_Train
 signs = np.array(["sorry"])
 
 # Number of sequences to be collected for each action
-no_datapacks = 25
+no_datapacks = 3
 
 # Frames per sequence, Changing requires changes in Signtrack_Train
 seq_length = 24
@@ -118,78 +118,94 @@ def Graphics(img, sign, sequence, collecting):
 '''
 Mediapipe uses an LSTM model, just like SignTrack, that means that the results are made
 based on a sequence of data. Thus when trying to make predictions on the flipped image it is
-important to utilize a different version of the Mediapipe model, to avoit the model's confusion.
+important to utilize a different version of the Mediapipe model, to avoid the model's confusion.
 '''
-holisticf = mp_holistic.Holistic(
-    min_detection_confidence=0.5, min_tracking_confidence=0.5)
 holistic = mp_holistic.Holistic(
     min_detection_confidence=0.5, min_tracking_confidence=0.5)
+holisticf = mp_holistic.Holistic(
+    min_detection_confidence=0.5, min_tracking_confidence=0.5)
 
-def DataAugm(frame):
-    '''
-    Gets as input a frame and outputs the keypoints data of the flipped image
-    '''
-    frame_fliped = cv2.flip(frame, 1)
-    img_flipped, results_flipped = mediapipe_detection(
-        frame_fliped, holisticf)
-    keypoints_flipped = extract_keypoints(results_flipped)
-    return keypoints_flipped
+
+
+# Loading the dimensions of cap and the assets and image
+hf, wf, cf = AssetCol.shape
+hc, wc, cc = AssetCircle.shape
+hbar, wbar, cbar = AssetBar.shape
+hb, wb, cb = (480, 640, 3)
+
+pics = []
+
+# Process the buffered frames of a sequence, saving keypoints for both the original and the flipped image
+def ProcessImg(pics,seq):
+    framenum=0
+    for frame in pics:
+        img, results = mediapipe_detection(frame, holistic)
+        keypoints = extract_keypoints(results)
+        npy_path = os.path.join(
+            data_path, sign, str((2 * seq) + exdt), str(framenum))
+        np.save(npy_path, keypoints)
+
+        # Save the landmarks of the flipped image as a numpy array
+        frame_fliped = cv2.flip(frame, 1)
+
+        img_flipped, results_flipped = mediapipe_detection(
+            frame_fliped, holisticf)
+
+        keypoints_flipped = extract_keypoints(results_flipped)
+        npy_path_flipped = os.path.join(
+            data_path, sign, str((2 * seq + 1) + exdt), str(framenum))
+        np.save(npy_path_flipped,keypoints_flipped)
+        framenum+=1
+    pics = []
 
 # Loop through each sign
+img=np.zeros((480, 640, 3), np.uint8)
 for sign in signs:
     exdt = existing_data(sign) - (no_datapacks * 2)
     # Loop through sequences aka videos
     for seq in range(no_datapacks):
+
         # Loop through video length aka sequence length
         for frame_num in range(seq_length):
 
             # Read camera feed
             ret, frame = cap.read()
+            pics.append(frame)
 
             # Detect hand and pose landmarks
             img, results = mediapipe_detection(frame, holistic)
 
             # Loading the dimentions of cap and the assets
-            hf, wf, cf = AssetCol.shape
-            hb, wb, cb = img.shape
-            hc, wc, cc = AssetCircle.shape
-            hbar, wbar, cbar = AssetBar.shape
 
             # Draw landmarks
             display_styled_landmarks(img, results)
 
-            '''
-            The wait logic,
-            If the collected sequence is on its first frame,
-            changes the apearence of the image accordingly.
-            '''
-            if frame_num == 0:
-                cv2.imshow('SignTrack Data Collect',
-                           Graphics(img, sign, seq, False))
-                cv2.waitKey(700)
-            else:
-                cv2.imshow('SignTrack Data Collect',
-                           Graphics(img, sign, seq, True))
+            cv2.imshow('SignTrack Data Collect',
+                       Graphics(img, sign, seq, False))
+            cv2.imshow('SignTrack Data Collect',
+                       Graphics(img, sign, seq, True))
+
             # Export keypoints
             keypoints = extract_keypoints(results)
 
-            # Save the landmarks as an numpy array
-            npy_path = os.path.join(
-                data_path, sign, str((2 * seq) + exdt), str(frame_num))
-            np.save(npy_path, keypoints)
-
-            # Save the landmarks of the flipped image as an numpy array
-            npy_path_flipped = os.path.join(
-                data_path, sign, str((2 * seq + 1) + exdt), str(frame_num))
-            np.save(npy_path_flipped, DataAugm(frame))
-
             if cv2.waitKey(10) & 0xFF == ord('q'):
                 break
+
+        '''
+        The wait logic,
+        If the collected sequence is on its first frame,
+        changes the appearance of the image accordingly.
+        '''
+
+        cv2.imshow('SignTrack Data Collect',
+                   Graphics(img, sign, seq, False))
+        cv2.waitKey(1000)
+        if pics:
+            ProcessImg(pics,seq)
+            pics.clear()
 
 cap.release()
 cv2.destroyAllWindows()
diff --git a/tests/__pycache__/Packr.cpython-38.pyc b/tests/__pycache__/Packr.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3177971153458590091a8baf1dcb994c27422c0c
Binary files /dev/null and b/tests/__pycache__/Packr.cpython-38.pyc differ
diff --git a/tests/__pycache__/essentials.cpython-38.pyc b/tests/__pycache__/essentials.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9f376cce13f161a94643f4d039a8fe48770e54bd
Binary files /dev/null and b/tests/__pycache__/essentials.cpython-38.pyc differ
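A note on the sequence-padding idea these changes touch: SignTrack_Train.py now performs round(sequence_length * 0.75) random frame duplications per augmented window (up from 0.5), and main.py / the new main copy.py pad a partially filled window by duplicating randomly chosen frames once at least 65% of the 24 frames are available. The sketch below illustrates that padding step; pad_sequence is a hypothetical helper written for illustration and is not part of the repository.

```python
import random

import numpy as np


def pad_sequence(seq, seq_length=24):
    """Pad a partial keypoint sequence to seq_length by duplicating
    randomly chosen frames, mirroring the >= 65% branch in main.py.

    pad_sequence is an illustrative helper, not a repository function.
    """
    padded = list(seq)  # copy so the live capture buffer is not mutated
    missing = seq_length - len(padded)
    for _ in range(missing):
        rand = random.randint(0, len(padded) - 1)
        padded.insert(rand, padded[rand])  # duplicate a random frame in place
    return np.array(padded)


# Example: 18 captured frames (>= 65% of 24), 258 keypoint values each
partial = [np.random.rand(258) for _ in range(18)]
print(pad_sequence(partial).shape)  # (24, 258)
```

Unlike the in-loop version in main.py, which pads the live list through the alias seqpros = seq, this sketch copies the buffer first; that is a deliberate simplification for illustration, not a description of the original code's behavior.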