simplify bootstrap discovery with optimized timeouts

This commit is contained in:
ankur12-1610
2025-08-26 01:41:26 +05:30
parent 3b27b02a8b
commit c940dac1e6

View File

@ -17,8 +17,8 @@ class BootstrapDiscovery:
""" """
Bootstrap-based peer discovery for py-libp2p. Bootstrap-based peer discovery for py-libp2p.
Uses Trio nurseries for parallel address resolution and connection attempts. Processes bootstrap addresses in parallel and attempts initial connections.
Connects to predefined bootstrap peers and adds them to peerstore. Adds discovered peers to peerstore for network bootstrapping.
""" """
def __init__(self, swarm: INetworkService, bootstrap_addrs: list[str]): def __init__(self, swarm: INetworkService, bootstrap_addrs: list[str]):
@ -29,15 +29,15 @@ class BootstrapDiscovery:
swarm: The network service (swarm) instance swarm: The network service (swarm) instance
bootstrap_addrs: List of bootstrap peer multiaddresses bootstrap_addrs: List of bootstrap peer multiaddresses
Note: Connection maintenance is always enabled to ensure reliable connectivity.
""" """
self.swarm = swarm self.swarm = swarm
self.peerstore = swarm.peerstore self.peerstore = swarm.peerstore
self.bootstrap_addrs = bootstrap_addrs or [] self.bootstrap_addrs = bootstrap_addrs or []
self.discovered_peers: set[str] = set() self.discovered_peers: set[str] = set()
self.connected_bootstrap_peers: set[ID] = set() self.connection_timeout: int = 10
self._disconnect_monitor_running = False self.connected_peers: set[ID] = (
set()
) # Track connected peers for drop detection
async def start(self) -> None: async def start(self) -> None:
"""Process bootstrap addresses and emit peer discovery events in parallel.""" """Process bootstrap addresses and emit peer discovery events in parallel."""
@ -71,7 +71,7 @@ class BootstrapDiscovery:
# Start all bootstrap address processing tasks in parallel # Start all bootstrap address processing tasks in parallel
for addr_str in self.bootstrap_addrs: for addr_str in self.bootstrap_addrs:
logger.info(f"Starting parallel task for: {addr_str}") logger.info(f"Starting parallel task for: {addr_str}")
nursery.start_soon(self._process_bootstrap_addr_safe, addr_str) nursery.start_soon(self._process_bootstrap_addr, addr_str)
# The nursery will wait for all address processing tasks to complete # The nursery will wait for all address processing tasks to complete
logger.info( logger.info(
@ -87,20 +87,13 @@ class BootstrapDiscovery:
logger.info("Bootstrap discovery startup complete - all tasks finished") logger.info("Bootstrap discovery startup complete - all tasks finished")
# Always start disconnect monitoring for reliable connectivity
if not self._disconnect_monitor_running:
trio.lowlevel.spawn_system_task(self._monitor_disconnections)
def stop(self) -> None: def stop(self) -> None:
"""Clean up bootstrap discovery resources and stop all background tasks.""" """Clean up bootstrap discovery resources."""
logger.info("Stopping bootstrap discovery and cleaning up tasks") logger.info("Stopping bootstrap discovery and cleaning up tasks")
# Clear discovered peers # Clear discovered peers
self.discovered_peers.clear() self.discovered_peers.clear()
self.connected_bootstrap_peers.clear() self.connected_peers.clear()
# Mark disconnect monitor as stopped
self._disconnect_monitor_running = False
logger.debug("Bootstrap discovery cleanup completed") logger.debug("Bootstrap discovery cleanup completed")
@ -164,7 +157,7 @@ class BootstrapDiscovery:
logger.debug(f"Skipping own peer ID: {peer_info.peer_id}") logger.debug(f"Skipping own peer ID: {peer_info.peer_id}")
return return
# Filter addresses to only include IPv4+TCP (restrict dialing attempts) # Filter addresses to only include IPv4+TCP (only supported protocol)
ipv4_tcp_addrs = [] ipv4_tcp_addrs = []
filtered_out_addrs = [] filtered_out_addrs = []
@ -174,12 +167,9 @@ class BootstrapDiscovery:
else: else:
filtered_out_addrs.append(addr) filtered_out_addrs.append(addr)
# Log filtering results with fallback strategy details # Log filtering results
logger.info(f"Address filtering for {peer_info.peer_id}:") logger.info(f"Address filtering for {peer_info.peer_id}:")
logger.info( logger.info(f"IPv4+TCP addresses: {len(ipv4_tcp_addrs)}")
f"IPv4+TCP addresses: {len(ipv4_tcp_addrs)} "
f"(will be tried in sequence for fallback)"
)
logger.info(f"Filtered out: {len(filtered_out_addrs)} (unsupported protocols)") logger.info(f"Filtered out: {len(filtered_out_addrs)} (unsupported protocols)")
# Show filtered addresses for debugging # Show filtered addresses for debugging
@ -187,11 +177,11 @@ class BootstrapDiscovery:
for addr in filtered_out_addrs: for addr in filtered_out_addrs:
logger.debug(f"Filtered: {addr}") logger.debug(f"Filtered: {addr}")
# Show addresses that will be used for fallback # Show addresses that will be used
if ipv4_tcp_addrs: if ipv4_tcp_addrs:
logger.debug("Addresses for fallback attempts:") logger.debug("Usable addresses:")
for i, addr in enumerate(ipv4_tcp_addrs, 1): for i, addr in enumerate(ipv4_tcp_addrs, 1):
logger.debug(f" Fallback {i}: {addr}") logger.debug(f" Address {i}: {addr}")
# Skip peer if no IPv4+TCP addresses available # Skip peer if no IPv4+TCP addresses available
if not ipv4_tcp_addrs: if not ipv4_tcp_addrs:
@ -202,12 +192,10 @@ class BootstrapDiscovery:
return return
logger.info( logger.info(
f"Will attempt connection with automatic fallback through " f"Will attempt connection using {len(ipv4_tcp_addrs)} IPv4+TCP addresses"
f"{len(ipv4_tcp_addrs)} IPv4+TCP addresses"
) )
# Add only IPv4+TCP addresses to peerstore # Add only IPv4+TCP addresses to peerstore
# (restrict dialing to supported protocols)
self.peerstore.add_addrs(peer_info.peer_id, ipv4_tcp_addrs, 0) self.peerstore.add_addrs(peer_info.peer_id, ipv4_tcp_addrs, 0)
# Allow other tasks to run after adding to peerstore # Allow other tasks to run after adding to peerstore
@ -268,10 +256,10 @@ class BootstrapDiscovery:
async def _connect_to_peer(self, peer_id: ID) -> None: async def _connect_to_peer(self, peer_id: ID) -> None:
""" """
Attempt to establish a connection to a peer with fallback logic. Attempt to establish a connection to a peer with timeout.
Uses swarm.dial_peer which tries all available addresses for the peer Uses swarm.dial_peer to connect using addresses stored in peerstore.
in sequence until one succeeds or all fail. Times out after connection_timeout seconds to prevent hanging.
""" """
logger.info(f"Connection attempt for peer: {peer_id}") logger.info(f"Connection attempt for peer: {peer_id}")
@ -303,55 +291,64 @@ class BootstrapDiscovery:
connection_start_time = trio.current_time() connection_start_time = trio.current_time()
try: try:
# Log connection attempt with fallback details with trio.move_on_after(self.connection_timeout):
logger.info( # Log connection attempt
f"Attempting connection to {peer_id} (will try {len(available_addrs)} " logger.info(
f"addresses with automatic fallback)" f"Attempting connection to {peer_id} using "
f"{len(available_addrs)} addresses"
)
# Log each address that will be attempted
for i, addr in enumerate(available_addrs, 1):
logger.debug(f"Address {i}: {addr}")
# Use swarm.dial_peer to connect using stored addresses
connection = await self.swarm.dial_peer(peer_id)
# Calculate connection time
connection_time = trio.current_time() - connection_start_time
# Allow other tasks to run after dial attempt
await trio.lowlevel.checkpoint()
# Post-connection validation: Verify connection was actually established
if peer_id in self.swarm.connections:
logger.info(
f"✅ Connected to {peer_id} (took {connection_time:.2f}s)"
)
# Track this connection for drop monitoring
self.connected_peers.add(peer_id)
# Start monitoring this specific connection for drops
trio.lowlevel.spawn_system_task(
self._monitor_peer_connection, peer_id
)
# Log which address was successful (if available)
if hasattr(connection, "get_transport_addresses"):
successful_addrs = connection.get_transport_addresses()
if successful_addrs:
logger.debug(f"Successful address: {successful_addrs[0]}")
else:
logger.warning(
f"Dial succeeded but connection not found for {peer_id}"
)
except trio.TooSlowError:
logger.warning(
f"❌ Connection to {peer_id} timed out after {self.connection_timeout}s"
) )
# Log each address that will be attempted
for i, addr in enumerate(available_addrs, 1):
logger.debug(f"Fallback address {i}: {addr}")
# Use swarm.dial_peer - this automatically implements fallback logic:
# - Tries each address in sequence until one succeeds
# - Collects exceptions from failed attempts
# - Raises SwarmException with MultiError if all attempts fail
connection = await self.swarm.dial_peer(peer_id)
# Calculate connection time
connection_time = trio.current_time() - connection_start_time
# Allow other tasks to run after dial attempt
await trio.lowlevel.checkpoint()
# Post-connection validation: Verify connection was actually established
if peer_id in self.swarm.connections:
logger.info(f"✅ Connected to {peer_id} (took {connection_time:.2f}s)")
# Track this as a connected bootstrap peer
self.connected_bootstrap_peers.add(peer_id)
# Log which address was successful (if available)
if hasattr(connection, "get_transport_addresses"):
successful_addrs = connection.get_transport_addresses()
if successful_addrs:
logger.debug(f"Successful address: {successful_addrs[0]}")
else:
logger.warning(f"Dial succeeded but connection not found for {peer_id}")
except SwarmException as e: except SwarmException as e:
# Calculate failed connection time # Calculate failed connection time
failed_connection_time = trio.current_time() - connection_start_time failed_connection_time = trio.current_time() - connection_start_time
# Enhanced error logging with fallback details # Enhanced error logging
error_msg = str(e) error_msg = str(e)
if "no addresses established a successful connection" in error_msg: if "no addresses established a successful connection" in error_msg:
logger.warning( logger.warning(
f"❌ Failed to connect to {peer_id} after trying all " f"❌ Failed to connect to {peer_id} after trying all "
f"{len(available_addrs)} addresses " f"{len(available_addrs)} addresses "
f"(took {failed_connection_time:.2f}s) - " f"(took {failed_connection_time:.2f}s)"
f"all fallback attempts failed"
) )
# Log individual address failures if this is a MultiError # Log individual address failures if this is a MultiError
if ( if (
@ -384,117 +381,44 @@ class BootstrapDiscovery:
# Don't re-raise to prevent killing the nursery and other parallel tasks # Don't re-raise to prevent killing the nursery and other parallel tasks
logger.debug("Continuing with other parallel connection attempts") logger.debug("Continuing with other parallel connection attempts")
async def _monitor_disconnections(self) -> None: async def _monitor_peer_connection(self, peer_id: ID) -> None:
""" """
Monitor bootstrap peer connections and immediately reconnect when they drop. Monitor a specific peer connection for drops using event-driven detection.
This runs as a background task that efficiently detects Waits for the connection to be removed from swarm.connections, which
disconnections in real-time. happens when error 4101 or other connection errors occur.
""" """
self._disconnect_monitor_running = True logger.debug(f"🔍 Started monitoring connection to {peer_id}")
logger.info(
"Disconnect monitor started - will reconnect "
"immediately when connections drop"
)
try: try:
while True: # Wait for the connection to disappear (event-driven)
# Check for disconnections more frequently but efficiently while peer_id in self.swarm.connections:
await trio.sleep(1.0) # Check every second for responsiveness await trio.sleep(0.1) # Small sleep to yield control
# Check which bootstrap peers are no longer connected # Connection was dropped - log it immediately
disconnected_peers = [] if peer_id in self.connected_peers:
for peer_id in list(self.connected_bootstrap_peers): self.connected_peers.discard(peer_id)
if peer_id not in self.swarm.connections: logger.warning(
disconnected_peers.append(peer_id) f"📡 Connection to {peer_id} was dropped! (detected event-driven)"
self.connected_bootstrap_peers.discard(peer_id) )
logger.info(
f"⚠️ Detected disconnection from bootstrap peer: {peer_id}"
)
# Immediately reconnect to disconnected peers # Log current connection count
if disconnected_peers: remaining_connections = len(self.connected_peers)
logger.info( logger.info(f"📊 Remaining connected peers: {remaining_connections}")
f"🔄 Immediately reconnecting to {len(disconnected_peers)} "
f"disconnected bootstrap peer(s)"
)
# Reconnect in parallel for better performance
try:
async with trio.open_nursery() as reconnect_nursery:
for peer_id in disconnected_peers:
logger.info(f"🔌 Reconnecting to {peer_id}")
reconnect_nursery.start_soon(
self._reconnect_to_peer, peer_id
)
except trio.Cancelled:
logger.debug("Reconnection nursery cancelled")
raise
except Exception as e:
logger.warning(f"Reconnection nursery failed: {e}")
except trio.Cancelled: except trio.Cancelled:
logger.info("Disconnect monitor stopped - task cancelled") logger.debug(f"Connection monitoring for {peer_id} stopped")
except Exception as e: except Exception as e:
logger.error(f"Unexpected error in disconnect monitor: {e}") logger.error(f"Error monitoring connection to {peer_id}: {e}")
finally: # Clean up tracking on error
self._disconnect_monitor_running = False self.connected_peers.discard(peer_id)
logger.debug("Disconnect monitor task cleanup completed")
async def _reconnect_to_peer(self, peer_id: ID) -> None:
"""
Reconnect to a specific bootstrap peer with backoff on failure.
This method includes simple backoff logic to avoid overwhelming
peers that may be temporarily unavailable.
"""
max_attempts = 3
base_delay = 1.0
try:
for attempt in range(1, max_attempts + 1):
try:
logger.debug(
f"Reconnection attempt {attempt}/{max_attempts} for {peer_id}"
)
await self._connect_to_peer(peer_id)
# If we get here, connection was successful
if peer_id in self.swarm.connections:
logger.info(
f"✅ Successfully reconnected to {peer_id} on "
f"attempt {attempt}"
)
return
except Exception as e:
logger.debug(
f"Reconnection attempt {attempt} failed for {peer_id}: {e}"
)
# Wait before next attempt (exponential backoff)
if attempt < max_attempts:
delay = base_delay * (2 ** (attempt - 1)) # 1s, 2s, 4s
logger.debug(
f"Waiting {delay}s before next reconnection attempt"
)
await trio.sleep(delay)
logger.warning(
f"❌ Failed to reconnect to {peer_id} after {max_attempts} attempts"
)
except Exception as e:
# Catch any unexpected errors to prevent crashing the nursery
logger.error(f"❌ Unexpected error during reconnection to {peer_id}: {e}")
# Don't re-raise to keep other parallel reconnection tasks running
def _is_ipv4_tcp_addr(self, addr: Multiaddr) -> bool: def _is_ipv4_tcp_addr(self, addr: Multiaddr) -> bool:
""" """
Check if address is IPv4 with TCP protocol only. Check if address is IPv4 with TCP protocol only.
This restricts dialing attempts to addresses that conform to IPv4+TCP, Filters out IPv6, UDP, QUIC, WebSocket, and other unsupported protocols.
filtering out IPv6, UDP, QUIC, WebSocket, and other unsupported protocols. Only IPv4+TCP addresses are supported by the current transport.
""" """
try: try:
protocols = addr.protocols() protocols = addr.protocols()