diff --git a/libp2p/stream_muxer/mplex/mplex_stream.py b/libp2p/stream_muxer/mplex/mplex_stream.py index 3b640df1..77546036 100644 --- a/libp2p/stream_muxer/mplex/mplex_stream.py +++ b/libp2p/stream_muxer/mplex/mplex_stream.py @@ -1,3 +1,5 @@ +from collections.abc import AsyncGenerator +from contextlib import asynccontextmanager from types import ( TracebackType, ) @@ -32,6 +34,72 @@ if TYPE_CHECKING: ) +class ReadWriteLock: + """ + A read-write lock that allows multiple concurrent readers + or one exclusive writer, implemented using Trio primitives. + """ + + def __init__(self) -> None: + self._readers = 0 + self._readers_lock = trio.Lock() # Protects access to _readers count + self._writer_lock = trio.Semaphore(1) # Allows only one writer at a time + + async def acquire_read(self) -> None: + """Acquire a read lock. Multiple readers can hold it simultaneously.""" + try: + async with self._readers_lock: + if self._readers == 0: + await self._writer_lock.acquire() + self._readers += 1 + except trio.Cancelled: + raise + + async def release_read(self) -> None: + """Release a read lock.""" + async with self._readers_lock: + if self._readers == 1: + self._writer_lock.release() + self._readers -= 1 + + async def acquire_write(self) -> None: + """Acquire an exclusive write lock.""" + try: + await self._writer_lock.acquire() + except trio.Cancelled: + raise + + def release_write(self) -> None: + """Release the exclusive write lock.""" + self._writer_lock.release() + + @asynccontextmanager + async def read_lock(self) -> AsyncGenerator[None, None]: + """Context manager for acquiring and releasing a read lock safely.""" + acquire = False + try: + await self.acquire_read() + acquire = True + yield + finally: + if acquire: + with trio.CancelScope() as scope: + scope.shield = True + await self.release_read() + + @asynccontextmanager + async def write_lock(self) -> AsyncGenerator[None, None]: + """Context manager for acquiring and releasing a write lock safely.""" + acquire = False + try: + await self.acquire_write() + acquire = True + yield + finally: + if acquire: + self.release_write() + + class MplexStream(IMuxedStream): """ reference: https://github.com/libp2p/go-mplex/blob/master/stream.go @@ -46,7 +114,7 @@ class MplexStream(IMuxedStream): read_deadline: int | None write_deadline: int | None - # TODO: Add lock for read/write to avoid interleaving receiving messages? + rw_lock: ReadWriteLock close_lock: trio.Lock # NOTE: `dataIn` is size of 8 in Go implementation. @@ -80,6 +148,7 @@ class MplexStream(IMuxedStream): self.event_remote_closed = trio.Event() self.event_reset = trio.Event() self.close_lock = trio.Lock() + self.rw_lock = ReadWriteLock() self.incoming_data_channel = incoming_data_channel self._buf = bytearray() @@ -113,48 +182,49 @@ class MplexStream(IMuxedStream): :param n: number of bytes to read :return: bytes actually read """ - if n is not None and n < 0: - raise ValueError( - "the number of bytes to read `n` must be non-negative or " - f"`None` to indicate read until EOF, got n={n}" - ) - if self.event_reset.is_set(): - raise MplexStreamReset - if n is None: - return await self._read_until_eof() - if len(self._buf) == 0: - data: bytes - # Peek whether there is data available. If yes, we just read until there is - # no data, then return. - try: - data = self.incoming_data_channel.receive_nowait() - self._buf.extend(data) - except trio.EndOfChannel: - raise MplexStreamEOF - except trio.WouldBlock: - # We know `receive` will be blocked here. Wait for data here with - # `receive` and catch all kinds of errors here. + async with self.rw_lock.read_lock(): + if n is not None and n < 0: + raise ValueError( + "the number of bytes to read `n` must be non-negative or " + f"`None` to indicate read until EOF, got n={n}" + ) + if self.event_reset.is_set(): + raise MplexStreamReset + if n is None: + return await self._read_until_eof() + if len(self._buf) == 0: + data: bytes + # Peek whether there is data available. If yes, we just read until + # there is no data, then return. try: - data = await self.incoming_data_channel.receive() + data = self.incoming_data_channel.receive_nowait() self._buf.extend(data) except trio.EndOfChannel: - if self.event_reset.is_set(): - raise MplexStreamReset - if self.event_remote_closed.is_set(): - raise MplexStreamEOF - except trio.ClosedResourceError as error: - # Probably `incoming_data_channel` is closed in `reset` when we are - # waiting for `receive`. - if self.event_reset.is_set(): - raise MplexStreamReset - raise Exception( - "`incoming_data_channel` is closed but stream is not reset. " - "This should never happen." - ) from error - self._buf.extend(self._read_return_when_blocked()) - payload = self._buf[:n] - self._buf = self._buf[len(payload) :] - return bytes(payload) + raise MplexStreamEOF + except trio.WouldBlock: + # We know `receive` will be blocked here. Wait for data here with + # `receive` and catch all kinds of errors here. + try: + data = await self.incoming_data_channel.receive() + self._buf.extend(data) + except trio.EndOfChannel: + if self.event_reset.is_set(): + raise MplexStreamReset + if self.event_remote_closed.is_set(): + raise MplexStreamEOF + except trio.ClosedResourceError as error: + # Probably `incoming_data_channel` is closed in `reset` when + # we are waiting for `receive`. + if self.event_reset.is_set(): + raise MplexStreamReset + raise Exception( + "`incoming_data_channel` is closed but stream is not reset." + "This should never happen." + ) from error + self._buf.extend(self._read_return_when_blocked()) + payload = self._buf[:n] + self._buf = self._buf[len(payload) :] + return bytes(payload) async def write(self, data: bytes) -> None: """ @@ -162,14 +232,15 @@ class MplexStream(IMuxedStream): :return: number of bytes written """ - if self.event_local_closed.is_set(): - raise MplexStreamClosed(f"cannot write to closed stream: data={data!r}") - flag = ( - HeaderTags.MessageInitiator - if self.is_initiator - else HeaderTags.MessageReceiver - ) - await self.muxed_conn.send_message(flag, data, self.stream_id) + async with self.rw_lock.write_lock(): + if self.event_local_closed.is_set(): + raise MplexStreamClosed(f"cannot write to closed stream: data={data!r}") + flag = ( + HeaderTags.MessageInitiator + if self.is_initiator + else HeaderTags.MessageReceiver + ) + await self.muxed_conn.send_message(flag, data, self.stream_id) async def close(self) -> None: """ diff --git a/newsfragments/748.feature.rst b/newsfragments/748.feature.rst new file mode 100644 index 00000000..199e5b3b --- /dev/null +++ b/newsfragments/748.feature.rst @@ -0,0 +1 @@ + Add lock for read/write to avoid interleaving receiving messages in mplex_stream.py diff --git a/tests/core/stream_muxer/test_read_write_lock.py b/tests/core/stream_muxer/test_read_write_lock.py new file mode 100644 index 00000000..621f3841 --- /dev/null +++ b/tests/core/stream_muxer/test_read_write_lock.py @@ -0,0 +1,590 @@ +from typing import Any, cast +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest +import trio +from trio.testing import wait_all_tasks_blocked + +from libp2p.stream_muxer.exceptions import ( + MuxedConnUnavailable, +) +from libp2p.stream_muxer.mplex.constants import HeaderTags +from libp2p.stream_muxer.mplex.datastructures import StreamID +from libp2p.stream_muxer.mplex.exceptions import ( + MplexStreamClosed, + MplexStreamEOF, + MplexStreamReset, +) +from libp2p.stream_muxer.mplex.mplex_stream import MplexStream + + +class MockMuxedConn: + """A mock Mplex connection for testing purposes.""" + + def __init__(self): + self.sent_messages = [] + self.streams: dict[StreamID, MplexStream] = {} + self.streams_lock = trio.Lock() + self.is_unavailable = False + + async def send_message( + self, flag: HeaderTags, data: bytes | None, stream_id: StreamID + ) -> None: + """Mocks sending a message over the connection.""" + if self.is_unavailable: + raise MuxedConnUnavailable("Connection is unavailable") + self.sent_messages.append((flag, data, stream_id)) + # Yield to allow other tasks to run + await trio.lowlevel.checkpoint() + + def get_remote_address(self) -> tuple[str, int]: + """Mocks getting the remote address.""" + return "127.0.0.1", 4001 + + +@pytest.fixture +async def mplex_stream(): + """Provides a fully initialized MplexStream and its communication channels.""" + # Use a buffered channel to prevent deadlocks in simple tests + send_chan, recv_chan = trio.open_memory_channel(10) + stream_id = StreamID(1, is_initiator=True) + muxed_conn = MockMuxedConn() + stream = MplexStream("test-stream", stream_id, cast(Any, muxed_conn), recv_chan) + muxed_conn.streams[stream_id] = stream + + yield stream, send_chan, muxed_conn + + # Cleanup: Close channels and reset stream state + await send_chan.aclose() + await recv_chan.aclose() + # Reset stream state to prevent cross-test contamination + stream.event_local_closed = trio.Event() + stream.event_remote_closed = trio.Event() + stream.event_reset = trio.Event() + + +# =============================================== +# 1. Tests for Stream-Level Lock Integration +# =============================================== + + +@pytest.mark.trio +async def test_stream_write_is_protected_by_rwlock(mplex_stream): + """Verify that stream.write() acquires and releases the write lock.""" + stream, _, muxed_conn = mplex_stream + + # Mock lock methods + original_acquire = stream.rw_lock.acquire_write + original_release = stream.rw_lock.release_write + + stream.rw_lock.acquire_write = AsyncMock(wraps=original_acquire) + stream.rw_lock.release_write = MagicMock(wraps=original_release) + + await stream.write(b"test data") + + stream.rw_lock.acquire_write.assert_awaited_once() + stream.rw_lock.release_write.assert_called_once() + + # Verify the message was actually sent + assert len(muxed_conn.sent_messages) == 1 + flag, data, stream_id = muxed_conn.sent_messages[0] + assert flag == HeaderTags.MessageInitiator + assert data == b"test data" + assert stream_id == stream.stream_id + + +@pytest.mark.trio +async def test_stream_read_is_protected_by_rwlock(mplex_stream): + """Verify that stream.read() acquires and releases the read lock.""" + stream, send_chan, _ = mplex_stream + + # Mock lock methods + original_acquire = stream.rw_lock.acquire_read + original_release = stream.rw_lock.release_read + + stream.rw_lock.acquire_read = AsyncMock(wraps=original_acquire) + stream.rw_lock.release_read = AsyncMock(wraps=original_release) + + await send_chan.send(b"hello") + result = await stream.read(5) + + stream.rw_lock.acquire_read.assert_awaited_once() + stream.rw_lock.release_read.assert_awaited_once() + assert result == b"hello" + + +@pytest.mark.trio +async def test_multiple_readers_can_coexist(mplex_stream): + """Verify multiple readers can operate concurrently.""" + stream, send_chan, _ = mplex_stream + + # Send enough data for both reads + await send_chan.send(b"data1") + await send_chan.send(b"data2") + + # Track lock acquisition order + acquisition_order = [] + release_order = [] + + # Patch lock methods to track concurrency + original_acquire = stream.rw_lock.acquire_read + original_release = stream.rw_lock.release_read + + async def tracked_acquire(): + nonlocal acquisition_order + acquisition_order.append("start") + await original_acquire() + acquisition_order.append("acquired") + + async def tracked_release(): + nonlocal release_order + release_order.append("start") + await original_release() + release_order.append("released") + + with ( + patch.object( + stream.rw_lock, "acquire_read", side_effect=tracked_acquire, autospec=True + ), + patch.object( + stream.rw_lock, "release_read", side_effect=tracked_release, autospec=True + ), + ): + # Execute concurrent reads + async with trio.open_nursery() as nursery: + nursery.start_soon(stream.read, 5) + nursery.start_soon(stream.read, 5) + + # Verify both reads happened + assert acquisition_order.count("start") == 2 + assert acquisition_order.count("acquired") == 2 + assert release_order.count("start") == 2 + assert release_order.count("released") == 2 + + +@pytest.mark.trio +async def test_writer_blocks_readers(mplex_stream): + """Verify that a writer blocks all readers and new readers queue behind.""" + stream, send_chan, _ = mplex_stream + + writer_acquired = trio.Event() + readers_ready = trio.Event() + writer_finished = trio.Event() + all_readers_started = trio.Event() + all_readers_done = trio.Event() + + counters = {"reader_start_count": 0, "reader_done_count": 0} + reader_target = 3 + reader_start_lock = trio.Lock() + + # Patch write lock to control test flow + original_acquire_write = stream.rw_lock.acquire_write + original_release_write = stream.rw_lock.release_write + + async def tracked_acquire_write(): + await original_acquire_write() + writer_acquired.set() + # Wait for readers to queue up + await readers_ready.wait() + + # Must be synchronous since real release_write is sync + def tracked_release_write(): + original_release_write() + writer_finished.set() + + with ( + patch.object( + stream.rw_lock, "acquire_write", side_effect=tracked_acquire_write + ), + patch.object( + stream.rw_lock, "release_write", side_effect=tracked_release_write + ), + ): + async with trio.open_nursery() as nursery: + # Start writer + nursery.start_soon(stream.write, b"test") + await writer_acquired.wait() + + # Start readers + async def reader_task(): + async with reader_start_lock: + counters["reader_start_count"] += 1 + if counters["reader_start_count"] == reader_target: + all_readers_started.set() + + try: + # This will block until data is available + await stream.read(5) + except (MplexStreamReset, MplexStreamEOF): + pass + finally: + async with reader_start_lock: + counters["reader_done_count"] += 1 + if counters["reader_done_count"] == reader_target: + all_readers_done.set() + + for _ in range(reader_target): + nursery.start_soon(reader_task) + + # Wait until all readers are started + await all_readers_started.wait() + + # Let the writer finish and release the lock + readers_ready.set() + await writer_finished.wait() + + # Send data to unblock the readers + for i in range(reader_target): + await send_chan.send(b"data" + str(i).encode()) + + # Wait for all readers to finish + await all_readers_done.wait() + + +@pytest.mark.trio +async def test_writer_waits_for_readers(mplex_stream): + """Verify a writer waits for existing readers to complete.""" + stream, send_chan, _ = mplex_stream + readers_started = trio.Event() + writer_entered = trio.Event() + writer_acquiring = trio.Event() + readers_finished = trio.Event() + + # Send data for readers + await send_chan.send(b"data1") + await send_chan.send(b"data2") + + # Patch read lock to control test flow + original_acquire_read = stream.rw_lock.acquire_read + + async def tracked_acquire_read(): + await original_acquire_read() + readers_started.set() + # Wait until readers are allowed to finish + await readers_finished.wait() + + # Patch write lock to detect when writer is blocked + original_acquire_write = stream.rw_lock.acquire_write + + async def tracked_acquire_write(): + writer_acquiring.set() + await original_acquire_write() + writer_entered.set() + + with ( + patch.object(stream.rw_lock, "acquire_read", side_effect=tracked_acquire_read), + patch.object( + stream.rw_lock, "acquire_write", side_effect=tracked_acquire_write + ), + ): + async with trio.open_nursery() as nursery: + # Start readers + nursery.start_soon(stream.read, 5) + nursery.start_soon(stream.read, 5) + + # Wait for at least one reader to acquire the lock + await readers_started.wait() + + # Start writer (should block) + nursery.start_soon(stream.write, b"test") + + # Wait for writer to start acquiring lock + await writer_acquiring.wait() + + # Verify writer hasn't entered critical section + assert not writer_entered.is_set() + + # Allow readers to finish + readers_finished.set() + + # Verify writer can proceed + await writer_entered.wait() + + +@pytest.mark.trio +async def test_lock_behavior_during_cancellation(mplex_stream): + """Verify that a lock is released when a task holding it is cancelled.""" + stream, _, _ = mplex_stream + + reader_acquired_lock = trio.Event() + + async def cancellable_reader(task_status): + async with stream.rw_lock.read_lock(): + reader_acquired_lock.set() + task_status.started() + # Wait indefinitely until cancelled. + await trio.sleep_forever() + + async with trio.open_nursery() as nursery: + # Start the reader and wait for it to acquire the lock. + await nursery.start(cancellable_reader) + await reader_acquired_lock.wait() + + # Now that the reader has the lock, cancel the nursery. + # This will cancel the reader task, and its lock should be released. + nursery.cancel_scope.cancel() + + # After the nursery is cancelled, the reader should have released the lock. + # To verify, we try to acquire a write lock. If the read lock was not + # released, this will time out. + with trio.move_on_after(1) as cancel_scope: + async with stream.rw_lock.write_lock(): + pass + if cancel_scope.cancelled_caught: + pytest.fail( + "Write lock could not be acquired after a cancelled reader, " + "indicating the read lock was not released." + ) + + +@pytest.mark.trio +async def test_concurrent_read_write_sequence(mplex_stream): + """Verify complex sequence of interleaved reads and writes.""" + stream, send_chan, _ = mplex_stream + results = [] + # Use a mock to intercept writes and feed them back to the read channel + original_write = stream.write + + reader1_finished = trio.Event() + writer1_finished = trio.Event() + reader2_finished = trio.Event() + + async def mocked_write(data: bytes) -> None: + await original_write(data) + # Simulate the other side receiving the data and sending a response + # by putting data into the read channel. + await send_chan.send(data) + + with patch.object(stream, "write", wraps=mocked_write) as patched_write: + async with trio.open_nursery() as nursery: + # Test scenario: + # 1. Reader 1 starts, waits for data. + # 2. Writer 1 writes, which gets fed back to the stream. + # 3. Reader 2 starts, reads what Writer 1 wrote. + # 4. Writer 2 writes. + + async def reader1(): + nonlocal results + results.append("R1 start") + data = await stream.read(5) + results.append(data) + results.append("R1 done") + reader1_finished.set() + + async def writer1(): + nonlocal results + await reader1_finished.wait() + results.append("W1 start") + await stream.write(b"write1") + results.append("W1 done") + writer1_finished.set() + + async def reader2(): + nonlocal results + await writer1_finished.wait() + # This will read the data from writer1 + results.append("R2 start") + data = await stream.read(6) + results.append(data) + results.append("R2 done") + reader2_finished.set() + + async def writer2(): + nonlocal results + await reader2_finished.wait() + results.append("W2 start") + await stream.write(b"write2") + results.append("W2 done") + + # Execute sequence + nursery.start_soon(reader1) + nursery.start_soon(writer1) + nursery.start_soon(reader2) + nursery.start_soon(writer2) + + await send_chan.send(b"data1") + + # Verify sequence and that write was called + assert patched_write.call_count == 2 + assert results == [ + "R1 start", + b"data1", + "R1 done", + "W1 start", + "W1 done", + "R2 start", + b"write1", + "R2 done", + "W2 start", + "W2 done", + ] + + +# =============================================== +# 2. Tests for Reset, EOF, and Close Interactions +# =============================================== + + +@pytest.mark.trio +async def test_read_after_remote_close_triggers_eof(mplex_stream): + """Verify reading from a remotely closed stream returns EOF correctly.""" + stream, send_chan, _ = mplex_stream + + # Send some data that can be read first + await send_chan.send(b"data") + # Close the channel to signify no more data will ever arrive + await send_chan.aclose() + + # Mark the stream as remotely closed + stream.event_remote_closed.set() + + # The first read should succeed, consuming the buffered data + data = await stream.read(4) + assert data == b"data" + + # Now that the buffer is empty and the channel is closed, this should raise EOF + with pytest.raises(MplexStreamEOF): + await stream.read(1) + + +@pytest.mark.trio +async def test_read_on_closed_stream_raises_eof(mplex_stream): + """Test that reading from a closed stream with no data raises EOF.""" + stream, send_chan, _ = mplex_stream + stream.event_remote_closed.set() + await send_chan.aclose() # Ensure the channel is closed + + # Reading from a stream that is closed and has no data should raise EOF + with pytest.raises(MplexStreamEOF): + await stream.read(100) + + +@pytest.mark.trio +async def test_write_to_locally_closed_stream_raises(mplex_stream): + """Verify writing to a locally closed stream raises MplexStreamClosed.""" + stream, _, _ = mplex_stream + stream.event_local_closed.set() + + with pytest.raises(MplexStreamClosed): + await stream.write(b"this should fail") + + +@pytest.mark.trio +async def test_read_from_reset_stream_raises(mplex_stream): + """Verify reading from a reset stream raises MplexStreamReset.""" + stream, _, _ = mplex_stream + stream.event_reset.set() + + with pytest.raises(MplexStreamReset): + await stream.read(10) + + +@pytest.mark.trio +async def test_write_to_reset_stream_raises(mplex_stream): + """Verify writing to a reset stream raises MplexStreamClosed.""" + stream, _, _ = mplex_stream + # A stream reset implies it's also locally closed. + await stream.reset() + + # The `write` method checks `event_local_closed`, which `reset` sets. + with pytest.raises(MplexStreamClosed): + await stream.write(b"this should also fail") + + +@pytest.mark.trio +async def test_stream_reset_cleans_up_resources(mplex_stream): + """Verify reset() cleans up stream state and resources.""" + stream, _, muxed_conn = mplex_stream + stream_id = stream.stream_id + + assert stream_id in muxed_conn.streams + await stream.reset() + + assert stream.event_reset.is_set() + assert stream.event_local_closed.is_set() + assert stream.event_remote_closed.is_set() + assert (HeaderTags.ResetInitiator, None, stream_id) in muxed_conn.sent_messages + assert stream_id not in muxed_conn.streams + # Verify the underlying data channel is closed + with pytest.raises(trio.ClosedResourceError): + await stream.incoming_data_channel.receive() + + +# =============================================== +# 3. Rigorous Concurrency Tests with Events +# =============================================== + + +@pytest.mark.trio +async def test_writer_is_blocked_by_reader_using_events(mplex_stream): + """Verify a writer must wait for a reader using trio.Event for synchronization.""" + stream, _, _ = mplex_stream + + reader_has_lock = trio.Event() + writer_finished = trio.Event() + + async def reader(): + async with stream.rw_lock.read_lock(): + reader_has_lock.set() + # Hold the lock until the writer has finished its attempt + await writer_finished.wait() + + async def writer(): + await reader_has_lock.wait() + # This call will now block until the reader releases the lock + await stream.write(b"data") + writer_finished.set() + + async with trio.open_nursery() as nursery: + nursery.start_soon(reader) + nursery.start_soon(writer) + + # Verify writer is blocked + await wait_all_tasks_blocked() + assert not writer_finished.is_set() + + # Signal the reader to finish + writer_finished.set() + + +@pytest.mark.trio +async def test_multiple_readers_can_read_concurrently_using_events(mplex_stream): + """Verify that multiple readers can acquire a read lock simultaneously.""" + stream, _, _ = mplex_stream + + counters = {"readers_in_critical_section": 0} + lock = trio.Lock() # To safely mutate the counter + + reader1_acquired = trio.Event() + reader2_acquired = trio.Event() + all_readers_finished = trio.Event() + + async def concurrent_reader(event_to_set: trio.Event): + async with stream.rw_lock.read_lock(): + async with lock: + counters["readers_in_critical_section"] += 1 + event_to_set.set() + # Wait until all readers have finished before exiting the lock context + await all_readers_finished.wait() + async with lock: + counters["readers_in_critical_section"] -= 1 + + async with trio.open_nursery() as nursery: + nursery.start_soon(concurrent_reader, reader1_acquired) + nursery.start_soon(concurrent_reader, reader2_acquired) + + # Wait for both readers to acquire their locks + await reader1_acquired.wait() + await reader2_acquired.wait() + + # Check that both were in the critical section at the same time + async with lock: + assert counters["readers_in_critical_section"] == 2 + + # Signal for all readers to finish + all_readers_finished.set() + + # Verify they exit cleanly + await wait_all_tasks_blocked() + async with lock: + assert counters["readers_in_critical_section"] == 0