#import <AVFoundation/AVFoundation.h>
#import <Vision/Vision.h>
#import <CoreImage/CoreImage.h>
#import <ImageIO/ImageIO.h>
#import <Foundation/Foundation.h>
// Describes one video capture device for FFI consumers.
// Both strings are heap copies (strdup); release the whole array with
// av_free_camera_list().
typedef struct {
const char *unique_id; // AVCaptureDevice.uniqueID (malloc'd C string)
const char *localized_name; // human-readable device name (malloc'd C string)
int position; // 1 = front, 2 = back, 3 = unspecified/other (see position_code)
int is_default; // 1 if this is the system default video device, else 0
} CDeviceInfo;
// One captured frame handed across the FFI boundary. On success
// jpeg_data/jpeg_len hold a malloc'd JPEG and error_msg is NULL; on
// failure error_msg is a malloc'd description. Release with
// av_free_frame_result().
typedef struct {
void *jpeg_data; // malloc'd JPEG bytes (NULL on failure)
size_t jpeg_len; // length of jpeg_data in bytes
uint32_t width; // frame width in pixels
uint32_t height; // frame height in pixels
const char *error_msg; // malloc'd error description, or NULL on success
} CFrameResult;
// A single recognized gesture. gesture_name points at a static string
// literal ("stop", "thumbs_up", "thumbs_down", "point", "wave", "nod",
// "shake") and must NOT be freed.
typedef struct {
const char *gesture_name; // static string literal; do not free
float confidence; // heuristic confidence in 0..1
int hand_code; // 0 = left hand, 1 = right hand, 2 = face, 3 = unknown
} CGestureItem;
// Result container for vn_detect_gestures(); release with
// vn_free_gesture_list().
typedef struct {
CGestureItem *items; // malloc'd array of `count` items, or NULL when empty
size_t count; // number of valid entries in items
const char *error_msg; // malloc'd error description, or NULL on success
} CGestureList;
// Maps an AVFoundation device position onto the stable integer codes
// used by the CDeviceInfo FFI struct: 1 = front, 2 = back, 3 = other.
static int position_code(AVCaptureDevicePosition pos) {
    if (pos == AVCaptureDevicePositionFront) {
        return 1;
    }
    if (pos == AVCaptureDevicePositionBack) {
        return 2;
    }
    return 3;
}
// Returns the raw AVAuthorizationStatus for video capture as an int
// (0 = not determined, 1 = restricted, 2 = denied, 3 = authorized).
int av_camera_authorization_status(void) {
    AVAuthorizationStatus status =
        [AVCaptureDevice authorizationStatusForMediaType:AVMediaTypeVideo];
    return (int)status;
}
// Ensures the app has camera permission, prompting the user if the
// status is still "not determined". Blocks the calling thread for up
// to 30 seconds while the system dialog is shown.
//
// Returns 1 when access is (or becomes) authorized, 0 otherwise
// (denied, restricted, or the prompt timed out).
int av_request_camera_access(void) {
    AVAuthorizationStatus status =
        [AVCaptureDevice authorizationStatusForMediaType:AVMediaTypeVideo];
    if (status == AVAuthorizationStatusAuthorized) return 1;
    // Denied / restricted cannot be fixed by prompting again.
    if (status != AVAuthorizationStatusNotDetermined) return 0;

    __block BOOL granted = NO;
    dispatch_semaphore_t sema = dispatch_semaphore_create(0);
    [AVCaptureDevice requestAccessForMediaType:AVMediaTypeVideo
                             completionHandler:^(BOOL g) {
        granted = g;
        dispatch_semaphore_signal(sema);
    }];
    // Check the wait result explicitly: on timeout the completion
    // handler has not run, so reading `granted` here would race with
    // its later write. Only read it after a successful wait, where the
    // semaphore signal establishes the necessary ordering.
    long rc = dispatch_semaphore_wait(
        sema, dispatch_time(DISPATCH_TIME_NOW, 30 * NSEC_PER_SEC));
    if (rc != 0) return 0; // timed out waiting for the user's answer
    return granted ? 1 : 0;
}
// Enumerates the video capture devices visible to AVFoundation and
// returns a malloc'd array of CDeviceInfo (caller releases with
// av_free_camera_list). *out_count receives the element count; when no
// devices exist or allocation fails, returns NULL with *out_count == 0.
CDeviceInfo *av_list_cameras(size_t *out_count) {
    NSArray<AVCaptureDevice *> *found;
    if (@available(macOS 10.15, *)) {
        // Discovery sessions are the modern enumeration API; include
        // both built-in and external (e.g. USB) cameras.
        AVCaptureDeviceDiscoverySession *discovery =
            [AVCaptureDeviceDiscoverySession
                discoverySessionWithDeviceTypes:@[
                    AVCaptureDeviceTypeBuiltInWideAngleCamera,
                    AVCaptureDeviceTypeExternalUnknown
                ]
                                      mediaType:AVMediaTypeVideo
                                       position:AVCaptureDevicePositionUnspecified];
        found = discovery.devices;
    } else {
        found = [AVCaptureDevice devicesWithMediaType:AVMediaTypeVideo];
    }

    size_t total = (size_t)found.count;
    *out_count = total;
    if (total == 0) return NULL;

    CDeviceInfo *infos = calloc(total, sizeof(CDeviceInfo));
    if (infos == NULL) {
        *out_count = 0;
        return NULL;
    }

    // Messaging a nil default device yields a nil uniqueID, so the
    // isEqualToString: check below simply marks nothing as default.
    AVCaptureDevice *systemDefault =
        [AVCaptureDevice defaultDeviceWithMediaType:AVMediaTypeVideo];
    NSUInteger idx = 0;
    for (AVCaptureDevice *dev in found) {
        infos[idx].unique_id = strdup(dev.uniqueID.UTF8String);
        infos[idx].localized_name = strdup(dev.localizedName.UTF8String);
        infos[idx].position = position_code(dev.position);
        infos[idx].is_default =
            [dev.uniqueID isEqualToString:systemDefault.uniqueID] ? 1 : 0;
        idx++;
    }
    return infos;
}
// Releases an array previously produced by av_list_cameras, including
// the strdup'd strings inside each entry. Safe to call with NULL.
void av_free_camera_list(CDeviceInfo *list, size_t count) {
    if (list == NULL) return;
    for (size_t idx = 0; idx < count; ++idx) {
        free((void *)list[idx].unique_id);
        free((void *)list[idx].localized_name);
    }
    free(list);
}
// Sample-buffer delegate that grabs exactly one frame: the first buffer
// delivered is retained into `capturedBuffer` and the semaphore is
// signalled so the waiting caller (av_capture_frame) can proceed.
// Callbacks are delivered serially on the queue given to
// setSampleBufferDelegate:queue:, so the first-frame check is not racy
// among callbacks themselves.
@interface CameraFrameDelegate : NSObject <AVCaptureVideoDataOutputSampleBufferDelegate>
// Signalled exactly once, when the first frame has been captured.
@property (nonatomic, strong) dispatch_semaphore_t semaphore;
// The retained frame; whoever consumes it must CFRelease it.
// `assign` because CMSampleBufferRef is a CF type not managed by ARC.
@property (nonatomic, assign) CMSampleBufferRef capturedBuffer;
@end
@implementation CameraFrameDelegate
- (void)captureOutput:(AVCaptureOutput *)output
didOutputSampleBuffer:(CMSampleBufferRef)sampleBuffer
fromConnection:(AVCaptureConnection *)connection {
// Only the first frame is kept; later callbacks are ignored.
if (self.capturedBuffer) return;
// Retain before storing: the buffer is only guaranteed to stay alive
// for the duration of this callback.
CFRetain(sampleBuffer);
self.capturedBuffer = sampleBuffer;
dispatch_semaphore_signal(self.semaphore);
}
@end
// Renders a CVPixelBuffer through Core Image and encodes it as JPEG
// (lossy quality 0.90). On success returns the encoded bytes and
// writes the pixel dimensions to out_width/out_height; returns nil on
// any failure.
static NSData *encode_pixel_buffer_as_jpeg(CVPixelBufferRef pixelBuffer,
                                           uint32_t *out_width,
                                           uint32_t *out_height) {
    CIImage *image = [CIImage imageWithCVPixelBuffer:pixelBuffer];
    if (image == nil) return nil;

    *out_width = (uint32_t)CVPixelBufferGetWidth(pixelBuffer);
    *out_height = (uint32_t)CVPixelBufferGetHeight(pixelBuffer);

    // Prefer the hardware renderer for speed.
    CIContext *renderCtx =
        [CIContext contextWithOptions:@{kCIContextUseSoftwareRenderer: @NO}];

    NSMutableData *encoded = [NSMutableData data];
    CGImageDestinationRef destination = CGImageDestinationCreateWithData(
        (CFMutableDataRef)encoded, kUTTypeJPEG, 1, NULL);
    if (destination == NULL) return nil;

    CGImageRef rendered = [renderCtx createCGImage:image fromRect:image.extent];
    if (rendered == NULL) {
        CFRelease(destination);
        return nil;
    }

    NSDictionary *jpegProps =
        @{(NSString *)kCGImageDestinationLossyCompressionQuality: @0.90};
    CGImageDestinationAddImage(destination, rendered, (CFDictionaryRef)jpegProps);
    bool finalized = CGImageDestinationFinalize(destination);

    CGImageRelease(rendered);
    CFRelease(destination);
    return finalized ? encoded : nil;
}
// Captures a single frame from the camera identified by device_id_cstr
// (AVCaptureDevice uniqueID), or from the system default camera when
// the id is NULL/unknown. Encodes the frame as JPEG and fills *result.
//
// Returns true on success; on failure sets result->error_msg (malloc'd,
// release with av_free_frame_result) and returns false. Blocks the
// calling thread for up to ~5 seconds waiting for the first frame.
bool av_capture_frame(const char *device_id_cstr, CFrameResult *result) {
    @autoreleasepool {
        // Zero every field up front so error paths never leave stale or
        // uninitialized pointers for av_free_frame_result() to free.
        result->jpeg_data = NULL;
        result->jpeg_len = 0;
        result->width = 0;
        result->height = 0;
        result->error_msg = NULL;

        AVCaptureDevice *device = nil;
        if (device_id_cstr) {
            NSString *wantedId = [NSString stringWithUTF8String:device_id_cstr];
            device = [AVCaptureDevice deviceWithUniqueID:wantedId];
        }
        if (!device) {
            device = [AVCaptureDevice defaultDeviceWithMediaType:AVMediaTypeVideo];
        }
        if (!device) {
            result->error_msg = strdup("No video capture device available");
            return false;
        }

        AVCaptureSession *session = [[AVCaptureSession alloc] init];
        session.sessionPreset = AVCaptureSessionPreset1280x720;

        NSError *error = nil;
        AVCaptureDeviceInput *input =
            [AVCaptureDeviceInput deviceInputWithDevice:device error:&error];
        // Check the return value, not the error pointer: Cocoa may set
        // the error object even on success.
        if (!input) {
            result->error_msg =
                strdup(error.localizedDescription.UTF8String ?: "Input init failed");
            return false;
        }
        if (![session canAddInput:input]) {
            result->error_msg = strdup("Cannot add device input to session");
            return false;
        }
        [session addInput:input];

        AVCaptureVideoDataOutput *output = [[AVCaptureVideoDataOutput alloc] init];
        // BGRA keeps the Core Image conversion in
        // encode_pixel_buffer_as_jpeg straightforward.
        output.videoSettings = @{
            (NSString *)kCVPixelBufferPixelFormatTypeKey:
                @(kCVPixelFormatType_32BGRA)
        };
        output.alwaysDiscardsLateVideoFrames = YES;

        dispatch_queue_t queue =
            dispatch_queue_create("ax.camera.capture", DISPATCH_QUEUE_SERIAL);
        CameraFrameDelegate *delegate = [[CameraFrameDelegate alloc] init];
        delegate.semaphore = dispatch_semaphore_create(0);
        [output setSampleBufferDelegate:delegate queue:queue];

        if (![session canAddOutput:output]) {
            result->error_msg = strdup("Cannot add video output to session");
            return false;
        }
        [session addOutput:output];

        [session startRunning];
        dispatch_time_t timeout = dispatch_time(DISPATCH_TIME_NOW, 5LL * NSEC_PER_SEC);
        long waited = dispatch_semaphore_wait(delegate.semaphore, timeout);
        [session stopRunning];
        [output setSampleBufferDelegate:nil queue:nil];
        // Drain the capture queue so no delegate callback is still in
        // flight when we inspect or release capturedBuffer below.
        dispatch_sync(queue, ^{});

        if (waited != 0) {
            // A frame may have arrived between the semaphore timeout
            // and stopRunning; release the retained buffer so it does
            // not leak.
            if (delegate.capturedBuffer) {
                CFRelease(delegate.capturedBuffer);
                delegate.capturedBuffer = nil;
            }
            result->error_msg = strdup("Timed out waiting for camera frame");
            return false;
        }
        if (!delegate.capturedBuffer) {
            result->error_msg = strdup("Timed out waiting for camera frame");
            return false;
        }

        CVPixelBufferRef pixelBuffer =
            CMSampleBufferGetImageBuffer(delegate.capturedBuffer);
        uint32_t width = 0, height = 0;
        NSData *jpeg = encode_pixel_buffer_as_jpeg(pixelBuffer, &width, &height);
        // Balance the CFRetain performed by the delegate.
        CFRelease(delegate.capturedBuffer);
        delegate.capturedBuffer = nil;

        if (!jpeg || jpeg.length == 0) {
            result->error_msg = strdup("Failed to encode frame as JPEG");
            return false;
        }

        // Hand the caller a plain malloc'd copy it can free() later.
        void *buf = malloc(jpeg.length);
        if (!buf) {
            result->error_msg = strdup("Out of memory");
            return false;
        }
        memcpy(buf, jpeg.bytes, jpeg.length);

        result->jpeg_data = buf;
        result->jpeg_len = jpeg.length;
        result->width = width;
        result->height = height;
        return true;
    }
}
// Releases the heap allocations inside a CFrameResult and resets the
// struct to an empty state so an accidental second call is harmless.
// NULL-safe. The struct itself is owned by the caller and not freed.
void av_free_frame_result(CFrameResult *result) {
    if (!result) return;
    free(result->jpeg_data);
    free((void *)result->error_msg);
    result->jpeg_data = NULL;
    result->error_msg = NULL;
    // Clear the scalar fields too so no stale length/dimensions remain
    // (mirrors vn_free_gesture_list, which resets count).
    result->jpeg_len = 0;
    result->width = 0;
    result->height = 0;
}
// Tuning thresholds for classify_hand_pose. Distances are in Vision's
// normalized image coordinates (0..1, y increasing upward).
static const float kMinJointConf = 0.15f; // minimum per-joint confidence to trust a point
static const float kFingerUpThresh = 0.08f; // fingertip must sit this far above its MCP/wrist
static const float kThumbUpThresh = 0.10f; // thumb tip this far above the wrist => "up"
static const float kThumbDownThresh = 0.10f; // thumb tip this far below the wrist => "down"
// Classifies a VNHumanHandPoseObservation into one of the static
// gesture strings used across the FFI ("stop", "thumbs_up",
// "thumbs_down", "point", "wave"), or NULL when nothing matches.
// Writes a heuristic confidence (0..1) to *confidence.
//
// Geometry: Vision hand-pose points are in normalized image
// coordinates with y increasing upward, so "finger up" means a larger
// y than the reference joint.
static const char *classify_hand_pose(VNHumanHandPoseObservation *obs, float *confidence) API_AVAILABLE(macos(11.0)) {
    NSError *err = nil;
    // Fetch every joint once up front. A failed lookup yields nil,
    // which the JOINT_OK checks below treat as "unusable".
    VNRecognizedPoint *wrist =
        [obs recognizedPointForJointName:VNHumanHandPoseObservationJointNameWrist error:&err];
    VNRecognizedPoint *thumbTip =
        [obs recognizedPointForJointName:VNHumanHandPoseObservationJointNameThumbTip error:&err];
    VNRecognizedPoint *indexTip =
        [obs recognizedPointForJointName:VNHumanHandPoseObservationJointNameIndexTip error:&err];
    VNRecognizedPoint *middleTip =
        [obs recognizedPointForJointName:VNHumanHandPoseObservationJointNameMiddleTip error:&err];
    VNRecognizedPoint *ringTip =
        [obs recognizedPointForJointName:VNHumanHandPoseObservationJointNameRingTip error:&err];
    VNRecognizedPoint *littleTip =
        [obs recognizedPointForJointName:VNHumanHandPoseObservationJointNameLittleTip error:&err];
    VNRecognizedPoint *thumbIP =
        [obs recognizedPointForJointName:VNHumanHandPoseObservationJointNameThumbIP error:&err];
    VNRecognizedPoint *indexMCP =
        [obs recognizedPointForJointName:VNHumanHandPoseObservationJointNameIndexMCP error:&err];
    VNRecognizedPoint *middleMCP =
        [obs recognizedPointForJointName:VNHumanHandPoseObservationJointNameMiddleMCP error:&err];
    VNRecognizedPoint *ringMCP =
        [obs recognizedPointForJointName:VNHumanHandPoseObservationJointNameRingMCP error:&err];
    VNRecognizedPoint *littleMCP =
        [obs recognizedPointForJointName:VNHumanHandPoseObservationJointNameLittleMCP error:&err];

    // The wrist anchors every comparison; without it we cannot classify.
    if (!wrist || wrist.confidence < kMinJointConf) {
        *confidence = 0.0f;
        return NULL;
    }
    float wy = (float)wrist.location.y;
    float wx = (float)wrist.location.x;
    NSLog(@"[axterminator] hand-pose wrist=(%.3f,%.3f,conf=%.2f) "
          @"thumbTip=(%.3f,%.3f,conf=%.2f) indexTip=(%.3f,%.3f,conf=%.2f) "
          @"middleTip=(%.3f,%.3f,conf=%.2f) ringTip=(%.3f,%.3f,conf=%.2f) "
          @"littleTip=(%.3f,%.3f,conf=%.2f) obs.confidence=%.2f",
          wx, wy, (float)wrist.confidence,
          thumbTip ? (float)thumbTip.location.x : -1.f,
          thumbTip ? (float)thumbTip.location.y : -1.f,
          thumbTip ? (float)thumbTip.confidence : -1.f,
          indexTip ? (float)indexTip.location.x : -1.f,
          indexTip ? (float)indexTip.location.y : -1.f,
          indexTip ? (float)indexTip.confidence : -1.f,
          middleTip ? (float)middleTip.location.x : -1.f,
          middleTip ? (float)middleTip.location.y : -1.f,
          middleTip ? (float)middleTip.confidence : -1.f,
          ringTip ? (float)ringTip.location.x : -1.f,
          ringTip ? (float)ringTip.location.y : -1.f,
          ringTip ? (float)ringTip.confidence : -1.f,
          littleTip ? (float)littleTip.location.x : -1.f,
          littleTip ? (float)littleTip.location.y : -1.f,
          littleTip ? (float)littleTip.confidence : -1.f,
          (float)obs.confidence);

// JOINT_OK(pt): the point was found and clears the confidence floor.
#define JOINT_OK(pt) ((pt) && (float)(pt).confidence >= kMinJointConf)
// TIP_ABOVE_MCP(tip, mcp): fingertip counts as "up" when it sits above
// its knuckle (MCP) by kFingerUpThresh; if the knuckle is unreliable,
// fall back to comparing the tip against the wrist.
#define TIP_ABOVE_MCP(tip, mcp) \
(JOINT_OK(tip) && JOINT_OK(mcp) \
? (float)(tip).location.y > (float)(mcp).location.y + kFingerUpThresh \
: (JOINT_OK(tip) && (float)(tip).location.y > wy + kFingerUpThresh))
    bool thumbUp = JOINT_OK(thumbTip) && (float)thumbTip.location.y > wy + kThumbUpThresh;
    bool thumbDown = JOINT_OK(thumbTip) && (float)thumbTip.location.y < wy - kThumbDownThresh;
    bool indexUp = TIP_ABOVE_MCP(indexTip, indexMCP);
    bool middleUp = TIP_ABOVE_MCP(middleTip, middleMCP);
    bool ringUp = TIP_ABOVE_MCP(ringTip, ringMCP);
    bool littleUp = TIP_ABOVE_MCP(littleTip, littleMCP);
// Undefine immediately after last use: these macros capture the local
// `wy` and must not leak into the rest of the translation unit.
#undef TIP_ABOVE_MCP
#undef JOINT_OK

    NSLog(@"[axterminator] gesture flags thumbUp=%d thumbDown=%d indexUp=%d "
          @"middleUp=%d ringUp=%d littleUp=%d",
          thumbUp, thumbDown, indexUp, middleUp, ringUp, littleUp);

    bool allFingersDown = !indexUp && !middleUp && !ringUp && !littleUp;
    // Open palm: thumb plus all four fingers extended.
    if (thumbUp && indexUp && middleUp && ringUp && littleUp) {
        *confidence = (float)MIN(obs.confidence, 0.95);
        return "stop";
    }
    // Thumb extended upward over a closed fist. thumbUp implies
    // thumbTip passed JOINT_OK, so dereferencing it here is safe.
    if (thumbUp && allFingersDown) {
        *confidence = (float)MIN(thumbTip.confidence, 0.90);
        return "thumbs_up";
    }
    // Thumb pointing down over a closed fist; a low IP joint boosts
    // confidence that the whole thumb (not just the tip) points down.
    if (thumbDown && allFingersDown) {
        bool ipDown = thumbIP && (float)thumbIP.location.y < wy - 0.10f;
        *confidence = ipDown ? (float)MIN(thumbTip.confidence, 0.85) : 0.60f;
        return "thumbs_down";
    }
    // Index finger extended alone.
    if (indexUp && !middleUp && !ringUp && !littleUp) {
        *confidence = (float)MIN(indexTip.confidence, 0.88);
        return "point";
    }
    // Index + middle extended (V shape).
    if (indexUp && middleUp && !ringUp && !littleUp) {
        *confidence = 0.80f;
        return "wave";
    }
    return NULL;
}
// Runs Vision hand-pose (and, on macOS 12+, face-pose) detection on a
// JPEG frame and fills `list` with up to two recognized gestures.
// Returns true on success, possibly with zero gestures; on failure
// sets list->error_msg (malloc'd) and returns false. Release the list
// with vn_free_gesture_list().
bool vn_detect_gestures(const uint8_t *jpeg_data,
                        size_t jpeg_len,
                        CGestureList *list) {
    @autoreleasepool {
        list->items = NULL;
        list->count = 0;
        list->error_msg = NULL;

        // Validate before wrapping the buffer: NSData raises on a NULL
        // pointer with a nonzero length.
        if (jpeg_data == NULL || jpeg_len == 0) {
            list->error_msg = strdup("Empty JPEG input");
            return false;
        }

        // Wrap the caller's buffer without copying; it only needs to
        // stay alive for the duration of this synchronous call.
        NSData *data = [NSData dataWithBytesNoCopy:(void *)jpeg_data
                                            length:jpeg_len
                                      freeWhenDone:NO];
        CIImage *ciImage = [CIImage imageWithData:data];
        if (!ciImage) {
            list->error_msg = strdup("Cannot decode JPEG for Vision processing");
            return false;
        }

        // Fixed capacity: at most two gestures (hand or face) are
        // reported per frame.
        CGestureItem *items = calloc(2, sizeof(CGestureItem));
        if (!items) {
            list->error_msg = strdup("Out of memory");
            return false;
        }
        size_t found = 0;

        if (@available(macOS 11.0, *)) {
            VNDetectHumanHandPoseRequest *handReq =
                [[VNDetectHumanHandPoseRequest alloc] init];
            handReq.maximumHandCount = 2;
            VNImageRequestHandler *handler =
                [[VNImageRequestHandler alloc] initWithCIImage:ciImage options:@{}];
            NSError *err = nil;
            // A Vision failure is deliberately treated as "no hands":
            // results stays nil and the loop below is skipped.
            [handler performRequests:@[handReq] error:&err];
            NSArray<VNHumanHandPoseObservation *> *observations = handReq.results;
            for (VNHumanHandPoseObservation *obs in observations) {
                if (found >= 2) break;
                float confidence = 0.0f;
                const char *name = classify_hand_pose(obs, &confidence);
                // Drop unclassified or low-confidence hands.
                if (!name || confidence < 0.5f) continue;
                items[found].gesture_name = name;
                items[found].confidence = confidence;
                if (@available(macOS 12.0, *)) {
                    // Chirality is only exposed on macOS 12+.
                    switch (obs.chirality) {
                        case VNChiralityLeft: items[found].hand_code = 0; break;
                        case VNChiralityRight: items[found].hand_code = 1; break;
                        default: items[found].hand_code = 3; break;
                    }
                } else {
                    items[found].hand_code = 3;
                }
                found++;
            }

            // Face pose: a dominant pitch reads as "nod", a dominant
            // yaw as "shake".
            VNDetectFaceLandmarksRequest *faceReq =
                [[VNDetectFaceLandmarksRequest alloc] init];
            VNImageRequestHandler *faceHandler =
                [[VNImageRequestHandler alloc] initWithCIImage:ciImage options:@{}];
            NSError *faceErr = nil;
            [faceHandler performRequests:@[faceReq] error:&faceErr];
            if (@available(macOS 12.0, *)) {
                // VNFaceObservation.pitch is only available on macOS 12+.
                for (VNFaceObservation *face in faceReq.results) {
                    if (found >= 2) break;
                    if (!face.pitch || !face.yaw) continue;
                    float pitch = (float)face.pitch.doubleValue;
                    float yaw = (float)face.yaw.doubleValue;
                    if (fabsf(pitch) > 0.35f && fabsf(pitch) > fabsf(yaw)) {
                        items[found].gesture_name = "nod";
                        items[found].confidence = MIN(0.75f + fabsf(pitch) * 0.5f, 0.95f);
                        items[found].hand_code = 2;
                        found++;
                    } else if (fabsf(yaw) > 0.35f && fabsf(yaw) > fabsf(pitch)) {
                        items[found].gesture_name = "shake";
                        items[found].confidence = MIN(0.75f + fabsf(yaw) * 0.5f, 0.95f);
                        items[found].hand_code = 2;
                        found++;
                    }
                }
            }
        } else {
            free(items);
            list->error_msg = strdup("VNDetectHumanHandPoseRequest requires macOS 11.0 or later");
            return false;
        }

        if (found == 0) {
            // Nothing recognized: hand back an empty (NULL) list.
            free(items);
            list->items = NULL;
        } else {
            list->items = items;
        }
        list->count = found;
        return true;
    }
}
// Frees the buffers inside a CGestureList and resets it so repeated
// calls are harmless. Accepts NULL. The gesture_name fields point at
// static string literals and are intentionally not freed.
void vn_free_gesture_list(CGestureList *list) {
    if (list == NULL) return;
    free(list->items);
    free((void *)list->error_msg);
    list->count = 0;
    list->items = NULL;
    list->error_msg = NULL;
}