Improve part metadata in stream (#58822)

uvjustin 2021-11-01 11:23:01 +08:00 committed by GitHub
parent 7126c9b0de
commit 9aaa92f366
2 changed files with 106 additions and 63 deletions


@@ -66,9 +66,15 @@ class SegmentBuffer:
memory_file: BytesIO,
sequence: int,
input_vstream: av.video.VideoStream,
) -> av.container.OutputContainer:
"""Make a new av OutputContainer."""
return av.open(
input_astream: av.audio.stream.AudioStream,
) -> tuple[
av.container.OutputContainer,
av.video.VideoStream,
av.audio.stream.AudioStream | None,
]:
"""Make a new av OutputContainer and add output streams."""
add_audio = input_astream and input_astream.name in AUDIO_CODECS
container = av.open(
memory_file,
mode="w",
format=SEGMENT_CONTAINER_FORMAT,
@@ -93,19 +99,21 @@ class SegmentBuffer:
# Create a fragment every TARGET_PART_DURATION. The data from each fragment is stored in
# a "Part" that can be combined with the data from all the other "Part"s, plus an init
# section, to reconstitute the data in a "Segment".
# frag_duration is the threshold for determining part boundaries, and the dts of the last
# packet in the part should correspond to a duration that is smaller than this value.
# However, as the part duration includes the duration of the last frame, the part duration
# will be equal to or greater than this value.
# We previously scaled this number down by .85 to account for this while keeping within
# the 15% variance allowed in part duration. However, this did not work when inputs had
# an audio stream - sometimes the fragment would get cut on the audio packet, causing
# the durations to actually be too short.
# The current approach is to use this frag_duration for creating the media while
# adjusting the metadata duration to keep the durations in the metadata below the
# part_target_duration threshold.
# The LL-HLS spec allows for a fragment's duration to be within the range [0.85x,1.0x]
# of the part target duration. We use the frag_duration option to tell ffmpeg to try to
# cut the fragments when they reach frag_duration. However, the resulting fragments can
# have variability in their durations and can end up being too short or too long. If
# there are two tracks, as in the case of a video feed with audio, the fragment cut seems
# to be done on the first track that crosses the desired threshold, and cutting on the
# audio track may result in a shorter video fragment than desired. Conversely, with a
# video track with no audio, the discrete nature of frames means that the frame at the
# end of a fragment will sometimes extend slightly beyond the desired frag_duration.
# Given this, our approach is to use a frag_duration near the upper end of the range for
# outputs with audio and a frag_duration at the lower end of the range for outputs with
# only video.
"frag_duration": str(
self._stream_settings.part_target_duration * 1e6
self._stream_settings.part_target_duration
* (98e4 if add_audio else 9e5)
),
}
if self._stream_settings.ll_hls
@@ -113,6 +121,12 @@ class SegmentBuffer:
),
},
)
output_vstream = container.add_stream(template=input_vstream)
# Check if audio is requested
output_astream = None
if add_audio:
output_astream = container.add_stream(template=input_astream)
return container, output_vstream, output_astream
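The scaling above reduces to a single multiplier on the part target duration: 0.98x when audio is muxed in (near the top of the LL-HLS [0.85x, 1.0x] band) and 0.9x for video-only output, with the seconds-to-microseconds conversion folded into the constants. A minimal sketch of that arithmetic (the helper name is illustrative, not part of this commit):

def frag_duration_us(part_target_duration: float, add_audio: bool) -> str:
    # ffmpeg's frag_duration option is given in microseconds.
    # 98e4 == 0.98 * 1e6 and 9e5 == 0.9 * 1e6, so each constant combines
    # the unit conversion with the chosen scaling factor.
    return str(part_target_duration * (98e4 if add_audio else 9e5))

# A 1.0s part target yields "980000.0" with audio and "900000.0" without.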
def set_streams(
self,
@@ -128,26 +142,22 @@ class SegmentBuffer:
"""Initialize a new stream segment."""
# Keep track of the number of segments we've processed
self._sequence += 1
self._segment_start_dts = video_dts
self._part_start_dts = self._segment_start_dts = video_dts
self._segment = None
self._memory_file = BytesIO()
self._memory_file_pos = 0
self._av_output = self.make_new_av(
(
self._av_output,
self._output_video_stream,
self._output_audio_stream,
) = self.make_new_av(
memory_file=self._memory_file,
sequence=self._sequence,
input_vstream=self._input_video_stream,
)
self._output_video_stream = self._av_output.add_stream(
template=self._input_video_stream
input_astream=self._input_audio_stream,
)
if self._output_video_stream.name == "hevc":
self._output_video_stream.codec_tag = "hvc1"
# Check if audio is requested
self._output_audio_stream = None
if self._input_audio_stream and self._input_audio_stream.name in AUDIO_CODECS:
self._output_audio_stream = self._av_output.add_stream(
template=self._input_audio_stream
)
def mux_packet(self, packet: av.Packet) -> None:
"""Mux a packet to the appropriate output stream."""
@@ -186,13 +196,9 @@ class SegmentBuffer:
# Fetch the latest StreamOutputs, which may have changed since the
# worker started.
stream_outputs=self._outputs_callback().values(),
start_time=self._start_time
+ datetime.timedelta(
seconds=float(self._segment_start_dts * packet.time_base)
),
start_time=self._start_time,
)
self._memory_file_pos = self._memory_file.tell()
self._part_start_dts = self._segment_start_dts
else: # These are the ends of the part segments
self.flush(packet, last_part=False)
@@ -201,17 +207,23 @@ class SegmentBuffer:
If last_part is True, also close the segment, give it a duration,
and clean up the av_output and memory_file.
There are two ways to enter this function: when last_part is True,
the packet has not yet been muxed, while when last_part is False it
has already been muxed. In both cases, packet is the next packet and
is not included in the Part.
This function writes the duration metadata for the Part and
for the Segment. However, as the fragmentation done by ffmpeg
may result in fragment durations which fall outside the
[0.85x,1.0x] tolerance band allowed by LL-HLS, we need to fudge
some durations a bit by reporting them as being within that
range.
Note that repeated adjustments may cause drift between the part
durations in the metadata and those in the media and result in
playback issues in some clients.
"""
# In some cases using the current packet's dts (which is the start
# dts of the next part) to calculate the part duration will result in a
# value which exceeds the part_target_duration. This can muck up the
# duration of both this part and the next part. An easy fix is to just
# use the current packet dts and cap it by the part target duration.
# The adjustment may cause a drift between this adjusted duration
# (used in the metadata) and the media duration, but the drift should be
# automatically corrected when the part duration cleanly divides the
# framerate.
current_dts = min(
# Part durations should not exceed the part target duration
adjusted_dts = min(
packet.dts,
self._part_start_dts
+ self._stream_settings.part_target_duration / packet.time_base,
@@ -220,29 +232,44 @@ class SegmentBuffer:
# Closing the av_output will write the remaining buffered data to the
# memory_file as a new moof/mdat.
self._av_output.close()
elif not self._part_has_keyframe:
# Parts which are not the last part or an independent part should
# not have durations below 0.85 of the part target duration.
adjusted_dts = max(
adjusted_dts,
self._part_start_dts
+ 0.85 * self._stream_settings.part_target_duration / packet.time_base,
)
assert self._segment
self._memory_file.seek(self._memory_file_pos)
self._hass.loop.call_soon_threadsafe(
self._segment.async_add_part,
Part(
duration=float((current_dts - self._part_start_dts) * packet.time_base),
duration=float(
(adjusted_dts - self._part_start_dts) * packet.time_base
),
has_keyframe=self._part_has_keyframe,
data=self._memory_file.read(),
),
float((current_dts - self._segment_start_dts) * packet.time_base)
(
segment_duration := float(
(adjusted_dts - self._segment_start_dts) * packet.time_base
)
)
if last_part
else 0,
)
if last_part:
# If we've written the last part, we can close the memory_file.
self._memory_file.close() # We don't need the BytesIO object anymore
self._start_time += datetime.timedelta(seconds=segment_duration)
# Reinitialize
self.reset(current_dts)
self.reset(packet.dts)
else:
# These are set again in reset() when flushing the last part, so we
# only need to set them here for non-final parts.
self._memory_file_pos = self._memory_file.tell()
self._part_start_dts = current_dts
self._part_start_dts = adjusted_dts
self._part_has_keyframe = False
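Taken together, the min/max adjustments in flush clamp each part's end dts into the [0.85x, 1.0x] band, skipping the floor for independent parts and for the final part of a segment. A standalone sketch of that clamp (hypothetical names; dts values in time_base ticks):

from fractions import Fraction

def clamp_part_end_dts(
    packet_dts: int,
    part_start_dts: int,
    part_target_duration: float,  # seconds
    time_base: Fraction,
    last_part: bool,
    has_keyframe: bool,
) -> float:
    target_ticks = part_target_duration / time_base
    # Cap: a part's reported duration must never exceed the target.
    adjusted_dts = min(packet_dts, part_start_dts + target_ticks)
    if not last_part and not has_keyframe:
        # Floor: parts that are neither final nor independent must not
        # report durations below 0.85x of the target.
        adjusted_dts = max(adjusted_dts, part_start_dts + 0.85 * target_ticks)
    return adjusted_dts

# With time_base = Fraction(1, 90000) and a 1.0s target, a packet dts of
# part_start_dts + 91500 is capped to +90000 (1.00s), while one at
# +70000 is floored to +76500 (0.85s).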
def discontinuity(self) -> None:


@@ -677,6 +677,10 @@ async def test_worker_log(hass, caplog):
async def test_durations(hass, record_worker_sync):
"""Test that the duration metadata matches the media."""
# Use a target part duration which has a slight mismatch
# with the incoming frame rate to better expose problems.
target_part_duration = TEST_PART_DURATION - 0.01
await async_setup_component(
hass,
"stream",
@@ -684,12 +688,12 @@ async def test_durations(hass, record_worker_sync):
"stream": {
CONF_LL_HLS: True,
CONF_SEGMENT_DURATION: SEGMENT_DURATION,
CONF_PART_DURATION: TEST_PART_DURATION,
CONF_PART_DURATION: target_part_duration,
}
},
)
source = generate_h264_video()
source = generate_h264_video(duration=SEGMENT_DURATION + 1)
stream = create_stream(hass, source, {})
# use record_worker_sync to grab output segments
@@ -702,25 +706,37 @@ async def test_durations(hass, record_worker_sync):
# check that the Part duration metadata matches the durations in the media
running_metadata_duration = 0
for segment in complete_segments:
for part in segment.parts:
av_segment = av.open(io.BytesIO(segment.init + segment.get_data()))
av_segment.close()
for part_num, part in enumerate(segment.parts):
av_part = av.open(io.BytesIO(segment.init + part.data))
running_metadata_duration += part.duration
# av_part.duration actually returns the dts of the first packet of
# the next av_part. When we normalize this by av.time_base we get
# the running duration of the media.
# The metadata duration is slightly different. The worker has
# some flexibility of where to set each metadata boundary, and
# when the media's duration is slightly too long, the metadata
# duration is adjusted down. This means that the running metadata
# duration may be up to one video frame duration smaller than the
# part duration.
assert running_metadata_duration < av_part.duration / av.time_base + 1e-6
assert (
running_metadata_duration
> av_part.duration / av.time_base
- 1 / av_part.streams.video[0].rate
- 1e-6
# av_part.duration actually returns the dts of the first packet of the next
# av_part. When we normalize this by av.time_base we get the running
# duration of the media.
# The metadata duration may differ slightly from the media duration.
# The worker has some flexibility of where to set each metadata boundary,
# and when the media's duration is slightly too long or too short, the
# metadata duration may be adjusted up or down.
# We check here that the divergence between the metadata duration and the
# media duration is not too large (2 frames seems reasonable here).
assert math.isclose(
(av_part.duration - av_part.start_time) / av.time_base,
part.duration,
abs_tol=2 / av_part.streams.video[0].rate + 1e-6,
)
# Also check that the sum of the durations so far matches the last dts
# in the media.
assert math.isclose(
running_metadata_duration,
av_part.duration / av.time_base,
abs_tol=1e-6,
)
# And check that the metadata duration is between 0.85x and 1.0x of
# the part target duration
if not (part.has_keyframe or part_num == len(segment.parts) - 1):
assert part.duration > 0.85 * target_part_duration - 1e-6
assert part.duration < target_part_duration + 1e-6
av_part.close()
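The assertions above convert PyAV container durations to seconds by dividing by av.time_base (ffmpeg's AV_TIME_BASE, 1,000,000 ticks per second). A toy illustration of the two-frame tolerance check, using made-up numbers rather than real stream output:

import math

AV_TIME_BASE = 1_000_000  # what av.time_base exposes in PyAV
frame_rate = 10  # hypothetical video frame rate

media_duration = 1_050_000 / AV_TIME_BASE  # 1.05s of media in the part
metadata_duration = 0.95  # duration reported in the Part metadata

# The metadata may diverge from the media by up to two frame durations.
assert math.isclose(
    media_duration, metadata_duration, abs_tol=2 / frame_rate + 1e-6
)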
# check that the Part durations are consistent with the Segment durations
for segment in complete_segments: