  • Can't find error in function for changing sampling rate [closed]

    28 avril, par kitty uwu

    I have a function for changing the sampling rate of audio (one channel only):

    /*
     * Resample a single-channel, packed-float (AV_SAMPLE_FMT_FLT) buffer from
     * input_sample_rate to output_sample_rate with libswresample.
     *
     * audio_input            input samples (mono); must not be NULL
     * input_sample_rate      rate of audio_input in Hz; must be > 0
     * output_sample_rate     requested rate in Hz; must be > 0
     * input_num_of_samples   number of samples in audio_input; must be >= 0
     * audio_output           receives a malloc()-ed buffer holding the result;
     *                        the caller releases it with free()
     * result_num_of_samples  receives the number of samples in *audio_output
     *
     * Returns SUCCESS on success, AVERROR(ENOMEM) when the resampler context
     * cannot be allocated, and -1 on any other failure. On failure the two
     * output parameters are left untouched and nothing is leaked.
     */
    int change_sampling_rate(float *audio_input, int input_sample_rate, int output_sample_rate, int input_num_of_samples, float **audio_output, int *result_num_of_samples) {
        /* Extra room (in samples) added to the output buffer whenever it is sized or grown. */
        enum { GROW_STEP = 1024 };

        AVChannelLayout src_ch_layout = AV_CHANNEL_LAYOUT_MONO;
        AVChannelLayout dst_ch_layout = AV_CHANNEL_LAYOUT_MONO;
        struct SwrContext *swr_ctx = NULL;
        const uint8_t *in_samples[1];
        uint8_t *out_samples[1];
        float *resampled = NULL;
        float *grown = NULL;
        int64_t estimate = 0;
        int capacity = 0;
        int produced = 0;
        int drained = 0;
        int status = -1;

        if (!audio_input || !audio_output || !result_num_of_samples ||
            input_sample_rate <= 0 || output_sample_rate <= 0 || input_num_of_samples < 0) {
            fprintf(stderr, "Invalid arguments\n");
            return -1;
        }

        swr_ctx = swr_alloc();
        if (!swr_ctx) {
            fprintf(stderr, "Could not allocate resampler context\n");
            return AVERROR(ENOMEM);
        }

        av_opt_set_chlayout(swr_ctx, "in_chlayout",    &src_ch_layout, 0);
        av_opt_set_int(swr_ctx, "in_sample_rate",       input_sample_rate, 0);
        av_opt_set_sample_fmt(swr_ctx, "in_sample_fmt", AV_SAMPLE_FMT_FLT, 0);
        av_opt_set_chlayout(swr_ctx, "out_chlayout",    &dst_ch_layout, 0);
        av_opt_set_int(swr_ctx, "out_sample_rate",       output_sample_rate, 0);
        av_opt_set_sample_fmt(swr_ctx, "out_sample_fmt", AV_SAMPLE_FMT_FLT, 0);

        if (swr_init(swr_ctx) < 0) {
            fprintf(stderr, "Failed to initialize the resampling context\n");
            goto cleanup;
        }

        /* Expected output length; the buffer is grown below if this turns out too small. */
        estimate = av_rescale_rnd(swr_get_delay(swr_ctx, input_sample_rate) + input_num_of_samples, output_sample_rate, input_sample_rate, AV_ROUND_UP);
        if (estimate < 0 || estimate > INT32_MAX - GROW_STEP) {
            fprintf(stderr, "Could not allocate resampled data\n");
            goto cleanup;
        }
        capacity = (int)estimate + GROW_STEP;

        resampled = (float *) malloc((size_t)capacity * sizeof(float));
        if (!resampled) {
            fprintf(stderr, "Could not allocate resampled data\n");
            goto cleanup;
        }

        in_samples[0] = (const uint8_t *)audio_input;
        out_samples[0] = (uint8_t *)resampled;
        produced = swr_convert(swr_ctx, out_samples, capacity, in_samples, input_num_of_samples);
        if (produced < 0) {
            fprintf(stderr, "Error while resampling\n");
            goto cleanup;
        }

        /*
         * Drain the resampler: passing a NULL input flushes the samples it is
         * still holding back. Without this the tail of the signal (the filter's
         * latency) is missing from the output, which makes the result shorter
         * than input_num_of_samples * output_sample_rate / input_sample_rate.
         */
        for (;;) {
            if (produced == capacity) {
                if (capacity > INT32_MAX - GROW_STEP) {
                    fprintf(stderr, "Could not allocate memory for output\n");
                    goto cleanup;
                }
                grown = (float *) realloc(resampled, ((size_t)capacity + GROW_STEP) * sizeof(float));
                if (!grown) {
                    fprintf(stderr, "Could not allocate memory for output\n");
                    goto cleanup;
                }
                resampled = grown;
                capacity += GROW_STEP;
            }
            out_samples[0] = (uint8_t *)(resampled + produced);
            drained = swr_convert(swr_ctx, out_samples, capacity - produced, NULL, 0);
            if (drained < 0) {
                fprintf(stderr, "Error while resampling\n");
                goto cleanup;
            }
            if (drained == 0) {
                break;
            }
            produced += drained;
        }

        /* Hand the buffer over to the caller; cleanup must not free it. */
        *audio_output = resampled;
        *result_num_of_samples = produced;
        resampled = NULL;
        status = SUCCESS;

    cleanup:
        free(resampled);
        swr_free(&swr_ctx);
        return status;
    }

    When I run tests on the time lag between two files (mp3) with different sampling rates, the result differs from the correct answer by about 15-20 ms. Can anybody please help me find the mistakes in the code?

    For example, I have two audio files, [audio_1] and [audio_2]; the second is just a sample of the first. The answer should be 35264 ms, but my function gives 35249 ms :(

  • How to yt-dlp extract youtube audio-only to 32-bit float 48000 .wav ? [closed]

    28 avril, par Rowe Morehouse

    My use case: Extract just the audio from a youtube URL directly to a .wav at 32-bit float 48000.

    Preferably without any post process args or secondary passes or after-the-fact conversion or muxing.

    I want f32le, aka PCM_f32le, aka PCM 32-bit floating-point little-endian, which is supported by ffmpeg. Also want 48000 sample rate, as stated.

    Is this possible?

    My current command:

    yt-dlp -f bestaudio --extract-audio --audio-format wav --audio-quality 0

    What do I need to add to achieve my use case / job-to-be-done??

  • Dynamic ffmpeg crop, scale & encoding code seems to break when the crop size changes

    28 avril, par Blindy

    The following code works perfectly as long as I only move the crop rectangle, however as soon as I change its size I no longer get frames out of my filter (av_buffersink_get_frame returns -11). It's crazy, even after the size changes, if it eventually changes to the original size that frame will go through, then it will go back to no longer providing frames.

    Would anyone happen to know what I'm doing wrong?

    My filter setup (note the crop & scale combination, it should (I think?) scale whatever I crop to the output video size):

    // buffer source -> buffer sink setup
    // Describe the frames that will be fed into the graph, taken from the decoder
    // context: frame size, pixel format (formatted as its integer enum value),
    // packet time base and sample aspect ratio.
    auto args = std::format("video_size={}x{}:pix_fmt={}:time_base={}/{}:pixel_aspect={}/{}",
        inputCodecContext->width, inputCodecContext->height, (int)inputCodecContext->pix_fmt,
        inputCodecContext->pkt_timebase.num, inputCodecContext->pkt_timebase.den,
        inputCodecContext->sample_aspect_ratio.num, inputCodecContext->sample_aspect_ratio.den);
    AVFilterContext* buffersrc_ctx = nullptr, * buffersink_ctx = nullptr;
    // Create the graph endpoints: "in" is initialised with the option string built
    // above, "out" takes no options.
    // NOTE(review): bufferSource / bufferSink are defined outside this snippet;
    // presumably the "buffer" and "buffersink" filters - confirm.
    check_av_result(avfilter_graph_create_filter(&buffersrc_ctx, bufferSource, "in",
        args.c_str(), nullptr, &*filterGraph));
    check_av_result(avfilter_graph_create_filter(&buffersink_ctx, bufferSink, "out",
        nullptr, nullptr, &*filterGraph));
    // Set the sink's "pix_fmts" option to the encoder's single pixel format
    // (passed as the raw bytes of the enum value).
    check_av_result(av_opt_set_bin(buffersink_ctx, "pix_fmts",
        (uint8_t*)&outputCodecContext->pix_fmt, sizeof(outputCodecContext->pix_fmt), AV_OPT_SEARCH_CHILDREN));
    // filter command setup
    // Chain: crop (no arguments here; its x/y/w/h are sent later as commands),
    // scale to the encoder's frame size, then force a 1:1 sample aspect ratio.
    auto filterSpec = std::format("crop,scale={}:{},setsar=1:1", outputCodecContext->width, outputCodecContext->height);
    // NOTE(review): filterInputs / filterOutputs are declared outside this snippet;
    // presumably the AVFilterInOut lists linking "in"/"out" to the parsed chain - confirm.
    check_av_result(avfilter_graph_parse_ptr(&*filterGraph, filterSpec.c_str(), &filterInputs, &filterOutputs, nullptr));
    // Validate and configure all links of the assembled graph.
    check_av_result(avfilter_graph_config(&*filterGraph, nullptr));

    Frame cropping:

    // Update the crop filter for this frame: top-left corner derived from the
    // rectangle's centre, then the rectangle's width and height.
    check_av_result(avfilter_graph_send_command(&*filterGraph, "crop", "x", std::to_string(cropRectangle.CenterX() - cropRectangle.Width() / 2).c_str(), nullptr, 0, 0));
    check_av_result(avfilter_graph_send_command(&*filterGraph, "crop", "y", std::to_string(cropRectangle.CenterY() - cropRectangle.Height() / 2).c_str(), nullptr, 0, 0));
    check_av_result(avfilter_graph_send_command(&*filterGraph, "crop", "w", std::to_string(cropRectangle.Width()).c_str(), nullptr, 0, 0));
    check_av_result(avfilter_graph_send_command(&*filterGraph, "crop", "h", std::to_string(cropRectangle.Height()).c_str(), nullptr, 0, 0));
    // push the decoded frame into the filter graph
    check_av_result(av_buffersrc_add_frame_flags(buffersrc_ctx, &*inputFrame, 0));
    // pull filtered frames from the filter graph
    while (1)
    {
        ret = av_buffersink_get_frame(buffersink_ctx, &*filteredFrame);
        if (ret < 0)
        {
            // if no more frames, rewrite the code to 0 to show it as normal completion
            if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF)
                ret = 0;
            // Stop pulling on any negative result; without this exit the loop
            // never terminates. A real error is left in ret for the caller.
            break;
        }
        // write the filtered frame to the output file
        // [...]
    }

    I also set the output video size before creating the file, and it is obeyed as expected:

    // Encoder frame size, taken from the requested output pixel size and converted to int.
    outputCodecContext->width = static_cast<int>(output.PixelSize().Width);
    outputCodecContext->height = static_cast<int>(output.PixelSize().Height);
  • adb screenrecord display only screenshot, it does not stream the screen [closed]

    28 avril, par hexols

    I have an Android TV, I want to stream its screen in my Ubuntu PC. I used this command:

    adb shell screenrecord --output-format=h264 - | ffplay -

    and after waiting for a while it displays the screenshot of the TV. But I want to display live stream of the Android TV. I also used the following command as well but got the same result:

    adb exec-out screenrecord --bit-rate=16m --output-format=h264 --size 800x600 - | ffplay -framerate 60 -framedrop -bufsize 16M -

    How can I achieve this using this command? Or is there another way to achieve it using VLC/GStreamer/FFmpeg, without using scrcpy/vysor?

  • How to Convert 16:9 Video to 9:16 Ratio While Ensuring Speaker Presence in Frame ?

    28 avril, par shreesha

    I have tried many times to figure out the problem with detecting the face, and the result is also not as smooth as other tools out there.

    So basically I am using Python and YOLO in this project, but I want to detect the person who is talking and use that person as the ROI (region of interest).

    Here is the code:

    from ultralytics import YOLO
    from ultralytics.engine.results import Results
    from moviepy.editor import VideoFileClip, concatenate_videoclips
    # NOTE(review): the module path was missing from this import; the MoviePy 1.x
    # location of the crop effect is assumed - confirm against the installed version.
    from moviepy.video.fx.crop import crop


    def clamp_center(center, window, limit):
        """Return ``center`` moved so a window of size ``window`` stays inside [0, limit]."""
        half = window / 2
        return min(max(center, half), limit - half)


    # Load the YOLOv8 model
    # NOTE(review): the weights filename was empty in the source; the stock YOLOv8
    # nano weights are assumed - replace with the intended model file.
    model = YOLO("yolov8n.pt")
    # Load the input video
    clip = VideoFileClip("short_test.mp4")

    # Target crop window: full height, 9:16 aspect ratio. Constant for the whole clip.
    w, h = clip.size
    new_w = int(h * 9 / 16)
    new_h = h

    # Start centred; this position is reused for frames without any detection so
    # that no frame is dropped and the output keeps the input's duration.
    x_center = w / 2
    y_center = h / 2

    tacked_clips = []
    for frame_no, frame in enumerate(clip.iter_frames()):
        # Process the frame
        results: list[Results] = model(frame)
        objects = results[0].boxes
        if objects:
            # The detection with the highest confidence is treated as the main object
            main_obj = max(objects, key=lambda box: float(box.conf))
            x1, y1, x2, y2 = [int(val) for val in main_obj.xyxy[0].tolist()]
            # Centre of the bounding box (midpoint of its corners), clamped so the
            # crop window does not leave the frame
            x_center = clamp_center((x1 + x2) / 2, new_w, w)
            y_center = clamp_center((y1 + y2) / 2, new_h, h)
        # Create a subclip for the current frame
        start_time = frame_no / clip.fps
        end_time = (frame_no + 1) / clip.fps
        subclip = clip.subclip(start_time, end_time)
        # Apply cropping using MoviePy
        cropped_clip = crop(
            subclip, x_center=x_center, y_center=y_center, width=new_w, height=new_h
        )
        tacked_clips.append(cropped_clip)
    reframed_clip = concatenate_videoclips(tacked_clips, method="compose")

    So basically I want to fix the face detection and ROI selection so that it detects the face, keeps that face and body in the frame, and makes sure that the person who is speaking is brought into the frame.