simplify bootstrap discovery with optimized timeouts

ankur12-1610
2025-08-26 01:41:26 +05:30
parent 3b27b02a8b
commit c940dac1e6


@@ -17,8 +17,8 @@ class BootstrapDiscovery:
"""
Bootstrap-based peer discovery for py-libp2p.
Uses Trio nurseries for parallel address resolution and connection attempts.
Connects to predefined bootstrap peers and adds them to peerstore.
Processes bootstrap addresses in parallel and attempts initial connections.
Adds discovered peers to peerstore for network bootstrapping.
"""
def __init__(self, swarm: INetworkService, bootstrap_addrs: list[str]):
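
An illustrative usage sketch (not part of this commit): constructing the discovery service from an existing swarm and driving it under trio. The swarm is assumed to come from elsewhere (e.g. host.get_network() in py-libp2p), the bootstrap multiaddr is an example value, and BootstrapDiscovery is assumed importable from this module.

    import trio

    BOOTSTRAP_ADDRS = [
        # example bootstrap multiaddr (peer ID shown is illustrative)
        "/ip4/104.131.131.82/tcp/4001/p2p/QmaCpDMGvV2BGHeYERUEnRQAwe3N8SzbUtfsmvsqQLuvuJ",
    ]

    async def run_discovery(swarm) -> None:
        # swarm is an INetworkService obtained elsewhere, e.g. host.get_network()
        discovery = BootstrapDiscovery(swarm, BOOTSTRAP_ADDRS)
        await discovery.start()   # resolves and dials bootstrap peers in parallel
        await trio.sleep(30)      # ... application keeps running ...
        discovery.stop()          # clears discovered/connected peer tracking
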
@@ -29,15 +29,15 @@ class BootstrapDiscovery:
swarm: The network service (swarm) instance
bootstrap_addrs: List of bootstrap peer multiaddresses
Note: Connection maintenance is always enabled to ensure reliable connectivity.
"""
self.swarm = swarm
self.peerstore = swarm.peerstore
self.bootstrap_addrs = bootstrap_addrs or []
self.discovered_peers: set[str] = set()
self.connected_bootstrap_peers: set[ID] = set()
self._disconnect_monitor_running = False
self.connection_timeout: int = 10
self.connected_peers: set[ID] = (
set()
) # Track connected peers for drop detection
async def start(self) -> None:
"""Process bootstrap addresses and emit peer discovery events in parallel."""
@@ -71,7 +71,7 @@ class BootstrapDiscovery:
# Start all bootstrap address processing tasks in parallel
for addr_str in self.bootstrap_addrs:
logger.info(f"Starting parallel task for: {addr_str}")
nursery.start_soon(self._process_bootstrap_addr_safe, addr_str)
nursery.start_soon(self._process_bootstrap_addr, addr_str)
# The nursery will wait for all address processing tasks to complete
logger.info(
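
A minimal sketch of the fan-out pattern used by start() (not part of this commit): each bootstrap address gets its own task via nursery.start_soon, and the nursery block only exits once every task has finished.

    import trio

    async def process_addr(addr_str: str) -> None:
        await trio.sleep(0.01)  # placeholder for parse/resolve/dial work
        print(f"processed {addr_str}")

    async def process_all(addrs: list[str]) -> None:
        async with trio.open_nursery() as nursery:
            for addr_str in addrs:
                nursery.start_soon(process_addr, addr_str)
        # reaching this point means every per-address task has completed

    trio.run(process_all, ["/ip4/127.0.0.1/tcp/4001", "/ip4/127.0.0.1/tcp/4002"])
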
@@ -87,20 +87,13 @@ class BootstrapDiscovery:
logger.info("Bootstrap discovery startup complete - all tasks finished")
# Always start disconnect monitoring for reliable connectivity
if not self._disconnect_monitor_running:
trio.lowlevel.spawn_system_task(self._monitor_disconnections)
def stop(self) -> None:
"""Clean up bootstrap discovery resources and stop all background tasks."""
"""Clean up bootstrap discovery resources."""
logger.info("Stopping bootstrap discovery and cleaning up tasks")
# Clear discovered peers
self.discovered_peers.clear()
self.connected_bootstrap_peers.clear()
# Mark disconnect monitor as stopped
self._disconnect_monitor_running = False
self.connected_peers.clear()
logger.debug("Bootstrap discovery cleanup completed")
@@ -164,7 +157,7 @@ class BootstrapDiscovery:
logger.debug(f"Skipping own peer ID: {peer_info.peer_id}")
return
# Filter addresses to only include IPv4+TCP (restrict dialing attempts)
# Filter addresses to only include IPv4+TCP (only supported protocol)
ipv4_tcp_addrs = []
filtered_out_addrs = []
@@ -174,12 +167,9 @@ class BootstrapDiscovery:
else:
filtered_out_addrs.append(addr)
# Log filtering results with fallback strategy details
# Log filtering results
logger.info(f"Address filtering for {peer_info.peer_id}:")
logger.info(
f"IPv4+TCP addresses: {len(ipv4_tcp_addrs)} "
f"(will be tried in sequence for fallback)"
)
logger.info(f"IPv4+TCP addresses: {len(ipv4_tcp_addrs)}")
logger.info(f"Filtered out: {len(filtered_out_addrs)} (unsupported protocols)")
# Show filtered addresses for debugging
@@ -187,11 +177,11 @@ class BootstrapDiscovery:
for addr in filtered_out_addrs:
logger.debug(f"Filtered: {addr}")
# Show addresses that will be used for fallback
# Show addresses that will be used
if ipv4_tcp_addrs:
logger.debug("Addresses for fallback attempts:")
logger.debug("Usable addresses:")
for i, addr in enumerate(ipv4_tcp_addrs, 1):
logger.debug(f" Fallback {i}: {addr}")
logger.debug(f" Address {i}: {addr}")
# Skip peer if no IPv4+TCP addresses available
if not ipv4_tcp_addrs:
@@ -202,12 +192,10 @@ class BootstrapDiscovery:
return
logger.info(
f"Will attempt connection with automatic fallback through "
f"{len(ipv4_tcp_addrs)} IPv4+TCP addresses"
f"Will attempt connection using {len(ipv4_tcp_addrs)} IPv4+TCP addresses"
)
# Add only IPv4+TCP addresses to peerstore
# (restrict dialing to supported protocols)
self.peerstore.add_addrs(peer_info.peer_id, ipv4_tcp_addrs, 0)
# Allow other tasks to run after adding to peerstore
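
A sketch of this hand-off step (not part of this commit): record the filtered addresses in the peerstore so a later dial_peer(peer_id) can use them, then yield to the scheduler. The ttl argument mirrors the call above; its exact semantics are defined by py-libp2p's peerstore interface.

    import trio

    async def register_addrs(peerstore, peer_id, ipv4_tcp_addrs) -> None:
        # store only the dialable IPv4+TCP addresses for this peer
        peerstore.add_addrs(peer_id, ipv4_tcp_addrs, 0)
        # explicit checkpoint so other trio tasks can run between peers
        await trio.lowlevel.checkpoint()
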
@@ -268,10 +256,10 @@ class BootstrapDiscovery:
async def _connect_to_peer(self, peer_id: ID) -> None:
"""
Attempt to establish a connection to a peer with fallback logic.
Attempt to establish a connection to a peer with timeout.
Uses swarm.dial_peer which tries all available addresses for the peer
in sequence until one succeeds or all fail.
Uses swarm.dial_peer to connect using addresses stored in peerstore.
Times out after connection_timeout seconds to prevent hanging.
"""
logger.info(f"Connection attempt for peer: {peer_id}")
@@ -303,55 +291,64 @@ class BootstrapDiscovery:
connection_start_time = trio.current_time()
try:
# Log connection attempt with fallback details
logger.info(
f"Attempting connection to {peer_id} (will try {len(available_addrs)} "
f"addresses with automatic fallback)"
with trio.move_on_after(self.connection_timeout):
# Log connection attempt
logger.info(
f"Attempting connection to {peer_id} using "
f"{len(available_addrs)} addresses"
)
# Log each address that will be attempted
for i, addr in enumerate(available_addrs, 1):
logger.debug(f"Address {i}: {addr}")
# Use swarm.dial_peer to connect using stored addresses
connection = await self.swarm.dial_peer(peer_id)
# Calculate connection time
connection_time = trio.current_time() - connection_start_time
# Allow other tasks to run after dial attempt
await trio.lowlevel.checkpoint()
# Post-connection validation: Verify connection was actually established
if peer_id in self.swarm.connections:
logger.info(
f"✅ Connected to {peer_id} (took {connection_time:.2f}s)"
)
# Track this connection for drop monitoring
self.connected_peers.add(peer_id)
# Start monitoring this specific connection for drops
trio.lowlevel.spawn_system_task(
self._monitor_peer_connection, peer_id
)
# Log which address was successful (if available)
if hasattr(connection, "get_transport_addresses"):
successful_addrs = connection.get_transport_addresses()
if successful_addrs:
logger.debug(f"Successful address: {successful_addrs[0]}")
else:
logger.warning(
f"Dial succeeded but connection not found for {peer_id}"
)
except trio.TooSlowError:
logger.warning(
f"❌ Connection to {peer_id} timed out after {self.connection_timeout}s"
)
# Log each address that will be attempted
for i, addr in enumerate(available_addrs, 1):
logger.debug(f"Fallback address {i}: {addr}")
# Use swarm.dial_peer - this automatically implements fallback logic:
# - Tries each address in sequence until one succeeds
# - Collects exceptions from failed attempts
# - Raises SwarmException with MultiError if all attempts fail
connection = await self.swarm.dial_peer(peer_id)
# Calculate connection time
connection_time = trio.current_time() - connection_start_time
# Allow other tasks to run after dial attempt
await trio.lowlevel.checkpoint()
# Post-connection validation: Verify connection was actually established
if peer_id in self.swarm.connections:
logger.info(f"✅ Connected to {peer_id} (took {connection_time:.2f}s)")
# Track this as a connected bootstrap peer
self.connected_bootstrap_peers.add(peer_id)
# Log which address was successful (if available)
if hasattr(connection, "get_transport_addresses"):
successful_addrs = connection.get_transport_addresses()
if successful_addrs:
logger.debug(f"Successful address: {successful_addrs[0]}")
else:
logger.warning(f"Dial succeeded but connection not found for {peer_id}")
except SwarmException as e:
# Calculate failed connection time
failed_connection_time = trio.current_time() - connection_start_time
# Enhanced error logging with fallback details
# Enhanced error logging
error_msg = str(e)
if "no addresses established a successful connection" in error_msg:
logger.warning(
f"❌ Failed to connect to {peer_id} after trying all "
f"{len(available_addrs)} addresses "
f"(took {failed_connection_time:.2f}s) - "
f"all fallback attempts failed"
f"(took {failed_connection_time:.2f}s)"
)
# Log individual address failures if this is a MultiError
if (
@@ -384,117 +381,44 @@ class BootstrapDiscovery:
# Don't re-raise to prevent killing the nursery and other parallel tasks
logger.debug("Continuing with other parallel connection attempts")
async def _monitor_disconnections(self) -> None:
async def _monitor_peer_connection(self, peer_id: ID) -> None:
"""
Monitor bootstrap peer connections and immediately reconnect when they drop.
Monitor a specific peer connection for drops using event-driven detection.
This runs as a background task that efficiently detects
disconnections in real-time.
Waits for the connection to be removed from swarm.connections, which
happens when error 4101 or other connection errors occur.
"""
self._disconnect_monitor_running = True
logger.info(
"Disconnect monitor started - will reconnect "
"immediately when connections drop"
)
logger.debug(f"🔍 Started monitoring connection to {peer_id}")
try:
while True:
# Check for disconnections more frequently but efficiently
await trio.sleep(1.0) # Check every second for responsiveness
# Wait for the connection to disappear (event-driven)
while peer_id in self.swarm.connections:
await trio.sleep(0.1) # Small sleep to yield control
# Check which bootstrap peers are no longer connected
disconnected_peers = []
for peer_id in list(self.connected_bootstrap_peers):
if peer_id not in self.swarm.connections:
disconnected_peers.append(peer_id)
self.connected_bootstrap_peers.discard(peer_id)
logger.info(
f"⚠️ Detected disconnection from bootstrap peer: {peer_id}"
)
# Connection was dropped - log it immediately
if peer_id in self.connected_peers:
self.connected_peers.discard(peer_id)
logger.warning(
f"📡 Connection to {peer_id} was dropped! (detected event-driven)"
)
# Immediately reconnect to disconnected peers
if disconnected_peers:
logger.info(
f"🔄 Immediately reconnecting to {len(disconnected_peers)} "
f"disconnected bootstrap peer(s)"
)
# Reconnect in parallel for better performance
try:
async with trio.open_nursery() as reconnect_nursery:
for peer_id in disconnected_peers:
logger.info(f"🔌 Reconnecting to {peer_id}")
reconnect_nursery.start_soon(
self._reconnect_to_peer, peer_id
)
except trio.Cancelled:
logger.debug("Reconnection nursery cancelled")
raise
except Exception as e:
logger.warning(f"Reconnection nursery failed: {e}")
# Log current connection count
remaining_connections = len(self.connected_peers)
logger.info(f"📊 Remaining connected peers: {remaining_connections}")
except trio.Cancelled:
logger.info("Disconnect monitor stopped - task cancelled")
logger.debug(f"Connection monitoring for {peer_id} stopped")
except Exception as e:
logger.error(f"Unexpected error in disconnect monitor: {e}")
finally:
self._disconnect_monitor_running = False
logger.debug("Disconnect monitor task cleanup completed")
async def _reconnect_to_peer(self, peer_id: ID) -> None:
"""
Reconnect to a specific bootstrap peer with backoff on failure.
This method includes simple backoff logic to avoid overwhelming
peers that may be temporarily unavailable.
"""
max_attempts = 3
base_delay = 1.0
try:
for attempt in range(1, max_attempts + 1):
try:
logger.debug(
f"Reconnection attempt {attempt}/{max_attempts} for {peer_id}"
)
await self._connect_to_peer(peer_id)
# If we get here, connection was successful
if peer_id in self.swarm.connections:
logger.info(
f"✅ Successfully reconnected to {peer_id} on "
f"attempt {attempt}"
)
return
except Exception as e:
logger.debug(
f"Reconnection attempt {attempt} failed for {peer_id}: {e}"
)
# Wait before next attempt (exponential backoff)
if attempt < max_attempts:
delay = base_delay * (2 ** (attempt - 1)) # 1s, 2s, 4s
logger.debug(
f"Waiting {delay}s before next reconnection attempt"
)
await trio.sleep(delay)
logger.warning(
f"❌ Failed to reconnect to {peer_id} after {max_attempts} attempts"
)
except Exception as e:
# Catch any unexpected errors to prevent crashing the nursery
logger.error(f"❌ Unexpected error during reconnection to {peer_id}: {e}")
# Don't re-raise to keep other parallel reconnection tasks running
logger.error(f"Error monitoring connection to {peer_id}: {e}")
# Clean up tracking on error
self.connected_peers.discard(peer_id)
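
For reference, a sketch of the exponential backoff used by the reconnect helper removed in this commit: delays of base_delay * 2 ** (attempt - 1), i.e. 1 s, 2 s, 4 s across three attempts.

    import trio

    async def retry_with_backoff(action, max_attempts: int = 3, base_delay: float = 1.0) -> bool:
        for attempt in range(1, max_attempts + 1):
            try:
                await action()          # e.g. a single dial attempt
                return True
            except Exception as exc:
                print(f"attempt {attempt}/{max_attempts} failed: {exc}")
            if attempt < max_attempts:
                await trio.sleep(base_delay * (2 ** (attempt - 1)))  # 1s, 2s, 4s
        return False
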
def _is_ipv4_tcp_addr(self, addr: Multiaddr) -> bool:
"""
Check if address is IPv4 with TCP protocol only.
This restricts dialing attempts to addresses that conform to IPv4+TCP,
filtering out IPv6, UDP, QUIC, WebSocket, and other unsupported protocols.
Filters out IPv6, UDP, QUIC, WebSocket, and other unsupported protocols.
Only IPv4+TCP addresses are supported by the current transport.
"""
try:
protocols = addr.protocols()
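
A standalone sketch of the IPv4+TCP check, assuming py-multiaddr's protocols() accessor, which yields the address's protocol descriptors in order:

    from multiaddr import Multiaddr

    def is_ipv4_tcp(addr: Multiaddr) -> bool:
        names = [proto.name for proto in addr.protocols()]
        # accept /ip4/.../tcp/... optionally followed by /p2p/<peer-id>
        return len(names) >= 2 and names[0] == "ip4" and names[1] == "tcp"

    print(is_ipv4_tcp(Multiaddr("/ip4/127.0.0.1/tcp/4001")))   # True
    print(is_ipv4_tcp(Multiaddr("/ip6/::1/tcp/4001")))         # False
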