Add ip4 and tcp address resolution and fallback connection attempts

ankur12-1610
2025-08-22 01:44:53 +05:30
parent 8d9b7f413d
commit 5a2fca32a0


@@ -1,11 +1,10 @@
import logging

import trio
from multiaddr import Multiaddr
from multiaddr.resolvers import DNSResolver

from libp2p.abc import ID, INetworkService, PeerInfo
from libp2p.discovery.events.peerDiscovery import peerDiscovery
from libp2p.network.exceptions import SwarmException
from libp2p.peer.peerinfo import info_from_p2p_addr
@@ -17,27 +16,39 @@ resolver = DNSResolver()
class BootstrapDiscovery:
    """
    Bootstrap-based peer discovery for py-libp2p.

    Uses Trio nurseries for parallel address resolution and connection attempts.
    Connects to predefined bootstrap peers and adds them to peerstore.
    """

    def __init__(self, swarm: INetworkService, bootstrap_addrs: list[str]):
        """
        Initialize BootstrapDiscovery.

        Args:
            swarm: The network service (swarm) instance
            bootstrap_addrs: List of bootstrap peer multiaddresses

        Note: Connection maintenance is always enabled to ensure reliable connectivity.
        """
        self.swarm = swarm
        self.peerstore = swarm.peerstore
        self.bootstrap_addrs = bootstrap_addrs or []
        self.discovered_peers: set[str] = set()
        self.connected_bootstrap_peers: set[ID] = set()
        self._disconnect_monitor_running = False

    async def start(self) -> None:
        """Process bootstrap addresses and emit peer discovery events in parallel."""
        logger.info(
            f"Starting bootstrap discovery with "
            f"{len(self.bootstrap_addrs)} bootstrap addresses"
        )

        # Show all bootstrap addresses being processed
        for i, addr in enumerate(self.bootstrap_addrs):
            logger.info(f"{i + 1}. {addr}")

        # Allow other tasks to run
        await trio.lowlevel.checkpoint()
@@ -50,30 +61,56 @@ class BootstrapDiscovery:
        await trio.lowlevel.checkpoint()

        # Use Trio nursery for PARALLEL address processing
        try:
            async with trio.open_nursery() as nursery:
                logger.info(
                    f"Starting {len(self.bootstrap_addrs)} parallel address "
                    f"processing tasks"
                )

                # Start all bootstrap address processing tasks in parallel
                for addr_str in self.bootstrap_addrs:
                    logger.info(f"Starting parallel task for: {addr_str}")
                    nursery.start_soon(self._process_bootstrap_addr_safe, addr_str)

                # The nursery will wait for all address processing tasks to complete
                logger.info(
                    "Nursery active - waiting for address processing tasks to complete"
                )
        except trio.Cancelled:
            logger.info("Bootstrap address processing cancelled - cleaning up tasks")
            raise
        except Exception as e:
            logger.error(f"Bootstrap address processing failed: {e}")
            raise

        logger.info("Bootstrap discovery startup complete - all tasks finished")

        # Always start disconnect monitoring for reliable connectivity
        if not self._disconnect_monitor_running:
            trio.lowlevel.spawn_system_task(self._monitor_disconnections)

    def stop(self) -> None:
        """Clean up bootstrap discovery resources and stop all background tasks."""
        logger.info("Stopping bootstrap discovery and cleaning up tasks")

        # Clear discovered peers
        self.discovered_peers.clear()
        self.connected_bootstrap_peers.clear()

        # Mark disconnect monitor as stopped
        self._disconnect_monitor_running = False

        logger.debug("Bootstrap discovery cleanup completed")

    async def _process_bootstrap_addr_safe(self, addr_str: str) -> None:
        """Safely process a bootstrap address with exception handling."""
        try:
            await self._process_bootstrap_addr(addr_str)
        except Exception as e:
            logger.warning(f"Failed to process bootstrap address {addr_str}: {e}")
            # Ensure task cleanup and continue processing other addresses

    async def _process_bootstrap_addr(self, addr_str: str) -> None:
        """Convert string address to PeerInfo and add to peerstore."""
@@ -82,19 +119,19 @@ class BootstrapDiscovery:
        except Exception as e:
            logger.debug(f"Invalid multiaddr format '{addr_str}': {e}")
            return

        if self.is_dns_addr(multiaddr):
            # Allow other tasks to run during DNS resolution
            await trio.lowlevel.checkpoint()
            resolved_addrs = await resolver.resolve(multiaddr)
            if resolved_addrs is None:
                logger.warning(f"DNS resolution returned None for: {addr_str}")
                return

            # Allow other tasks to run after DNS resolution
            await trio.lowlevel.checkpoint()

            peer_id_str = multiaddr.get_peer_id()
            if peer_id_str is None:
                logger.warning(f"Missing peer ID in DNS address: {addr_str}")
@@ -115,24 +152,70 @@ class BootstrapDiscovery:
        return any(protocol.name == "dnsaddr" for protocol in addr.protocols())

    async def add_addr(self, peer_info: PeerInfo) -> None:
        """
        Add a peer to the peerstore, emit discovery event,
        and attempt connection in parallel.
        """
        logger.info(f"Adding peer to peerstore: {peer_info.peer_id}")
        logger.info(f"Total addresses received: {len(peer_info.addrs)}")

        # Skip if it's our own peer
        if peer_info.peer_id == self.swarm.get_peer_id():
            logger.debug(f"Skipping own peer ID: {peer_info.peer_id}")
            return

        # Filter addresses to only include IPv4+TCP (restrict dialing attempts)
        ipv4_tcp_addrs = []
        filtered_out_addrs = []
        for addr in peer_info.addrs:
            if self._is_ipv4_tcp_addr(addr):
                ipv4_tcp_addrs.append(addr)
            else:
                filtered_out_addrs.append(addr)

        # Log filtering results with fallback strategy details
        logger.info(f"Address filtering for {peer_info.peer_id}:")
        logger.info(
            f"IPv4+TCP addresses: {len(ipv4_tcp_addrs)} "
            f"(will be tried in sequence for fallback)"
        )
        logger.info(f"Filtered out: {len(filtered_out_addrs)} (unsupported protocols)")

        # Show filtered addresses for debugging
        if filtered_out_addrs:
            for addr in filtered_out_addrs:
                logger.debug(f"Filtered: {addr}")

        # Show addresses that will be used for fallback
        if ipv4_tcp_addrs:
            logger.debug("Addresses for fallback attempts:")
            for i, addr in enumerate(ipv4_tcp_addrs, 1):
                logger.debug(f"  Fallback {i}: {addr}")

        # Skip peer if no IPv4+TCP addresses available
        if not ipv4_tcp_addrs:
            logger.warning(
                f"❌ No IPv4+TCP addresses for {peer_info.peer_id} - "
                f"skipping connection attempts"
            )
            return

        logger.info(
            f"Will attempt connection with automatic fallback through "
            f"{len(ipv4_tcp_addrs)} IPv4+TCP addresses"
        )

        # Add only IPv4+TCP addresses to peerstore
        # (restrict dialing to supported protocols)
        self.peerstore.add_addrs(peer_info.peer_id, ipv4_tcp_addrs, 0)

        # Allow other tasks to run after adding to peerstore
        await trio.lowlevel.checkpoint()

        # Verify addresses were added
        stored_addrs = self.peerstore.addrs(peer_info.peer_id)
        logger.info(f"Addresses stored in peerstore: {len(stored_addrs)} addresses")

        # Only emit discovery event if this is the first time we see this peer
        peer_id_str = str(peer_info.peer_id)
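To make the filtering rule in the hunk above concrete, a small self-contained sketch (illustrative addresses, not taken from the commit) of the same check that _is_ipv4_tcp_addr applies at the end of this diff: only multiaddrs carrying both an ip4 and a tcp component are kept for dialing.

from multiaddr import Multiaddr

def keeps(addr_str: str) -> bool:
    # Same rule as _is_ipv4_tcp_addr: require both ip4 and tcp components.
    names = {p.name for p in Multiaddr(addr_str).protocols()}
    return "ip4" in names and "tcp" in names

examples = [
    "/ip4/104.131.131.82/tcp/4001/p2p/QmaCpDMGvV2BGHeYERUEnRQAwe3N8SzbUtfsmvsqQLuvuJ",  # kept
    "/ip6/2604:1380:2000:7a00::1/tcp/4001",  # dropped: IPv6
    "/ip4/104.131.131.82/udp/4001",          # dropped: UDP, no TCP
    "/dns4/example.com/tcp/443",             # dropped: DNS name, no ip4 component
]

for s in examples:
    print("dial" if keeps(s) else "skip", s)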
@@ -143,24 +226,55 @@ class BootstrapDiscovery:
            peerDiscovery.emit_peer_discovered(peer_info)
            logger.debug(f"Peer discovered: {peer_info.peer_id}")

            # Use nursery for parallel connection attempt (non-blocking)
            try:
                async with trio.open_nursery() as connection_nursery:
                    logger.info("Starting parallel connection attempt...")
                    connection_nursery.start_soon(
                        self._connect_to_peer, peer_info.peer_id
                    )
            except trio.Cancelled:
                logger.debug(f"Connection attempt cancelled for {peer_info.peer_id}")
                raise
            except Exception as e:
                logger.warning(
                    f"Connection nursery failed for {peer_info.peer_id}: {e}"
                )
        else:
            logger.debug(
                f"Additional addresses added for existing peer: {peer_info.peer_id}"
            )
            # Even for existing peers, try to connect if not already connected
            if peer_info.peer_id not in self.swarm.connections:
                logger.info("Starting parallel connection attempt for existing peer...")
                # Use nursery for parallel connection attempt (non-blocking)
                try:
                    async with trio.open_nursery() as connection_nursery:
                        connection_nursery.start_soon(
                            self._connect_to_peer, peer_info.peer_id
                        )
                except trio.Cancelled:
                    logger.debug(
                        f"Connection attempt cancelled for existing peer "
                        f"{peer_info.peer_id}"
                    )
                    raise
                except Exception as e:
                    logger.warning(
                        f"Connection nursery failed for existing peer "
                        f"{peer_info.peer_id}: {e}"
                    )

    async def _connect_to_peer(self, peer_id: ID) -> None:
        """
        Attempt to establish a connection to a peer with fallback logic.

        Uses swarm.dial_peer which tries all available addresses for the peer
        in sequence until one succeeds or all fail.
        """
        logger.info(f"Connection attempt for peer: {peer_id}")

        # Pre-connection validation: Check if already connected
        if peer_id in self.swarm.connections:
            logger.debug(
@@ -173,33 +287,230 @@ class BootstrapDiscovery:
        # Check available addresses before attempting connection
        available_addrs = self.peerstore.addrs(peer_id)
        logger.info(
            f"Available addresses for {peer_id}: {len(available_addrs)} addresses"
        )

        # Log all available addresses for transparency
        for i, addr in enumerate(available_addrs, 1):
            logger.debug(f"  Address {i}: {addr}")

        if not available_addrs:
            logger.error(f"❌ No addresses available for {peer_id} - cannot connect")
            return

        # Record start time for connection attempt monitoring
        connection_start_time = trio.current_time()

        try:
            # Log connection attempt with fallback details
            logger.info(
                f"Attempting connection to {peer_id} (will try {len(available_addrs)} "
                f"addresses with automatic fallback)"
            )

            # Log each address that will be attempted
            for i, addr in enumerate(available_addrs, 1):
                logger.debug(f"Fallback address {i}: {addr}")

            # Use swarm.dial_peer - this automatically implements fallback logic:
            # - Tries each address in sequence until one succeeds
            # - Collects exceptions from failed attempts
            # - Raises SwarmException with MultiError if all attempts fail
            connection = await self.swarm.dial_peer(peer_id)

            # Calculate connection time
            connection_time = trio.current_time() - connection_start_time

            # Allow other tasks to run after dial attempt
            await trio.lowlevel.checkpoint()

            # Post-connection validation: Verify connection was actually established
            if peer_id in self.swarm.connections:
                logger.info(f"Connected to {peer_id} (took {connection_time:.2f}s)")

                # Track this as a connected bootstrap peer
                self.connected_bootstrap_peers.add(peer_id)

                # Log which address was successful (if available)
                if hasattr(connection, "get_transport_addresses"):
                    successful_addrs = connection.get_transport_addresses()
                    if successful_addrs:
                        logger.debug(f"Successful address: {successful_addrs[0]}")
            else:
                logger.warning(f"Dial succeeded but connection not found for {peer_id}")
        except SwarmException as e:
            # Calculate failed connection time
            failed_connection_time = trio.current_time() - connection_start_time

            # Enhanced error logging with fallback details
            error_msg = str(e)
            if "no addresses established a successful connection" in error_msg:
                logger.warning(
                    f"❌ Failed to connect to {peer_id} after trying all "
                    f"{len(available_addrs)} addresses "
                    f"(took {failed_connection_time:.2f}s) - "
                    f"all fallback attempts failed"
                )

                # Log individual address failures if this is a MultiError
                if (
                    e.__cause__ is not None
                    and hasattr(e.__cause__, "exceptions")
                    and getattr(e.__cause__, "exceptions", None) is not None
                ):
                    exceptions_list = getattr(e.__cause__, "exceptions")
                    logger.info("📋 Individual address failure details:")
                    for i, addr_exception in enumerate(exceptions_list, 1):
                        logger.info(f"Address {i}: {addr_exception}")
                        # Also log the actual address that failed
                        if i <= len(available_addrs):
                            logger.info(f"Failed address: {available_addrs[i - 1]}")
                else:
                    logger.warning("No detailed exception information available")
            else:
                logger.warning(
                    f"❌ Failed to connect to {peer_id}: {e} "
                    f"(took {failed_connection_time:.2f}s)"
                )
        except Exception as e:
            # Handle unexpected errors that aren't swarm-specific
            failed_connection_time = trio.current_time() - connection_start_time
            logger.error(
                f"❌ Unexpected error connecting to {peer_id}: "
                f"{e} (took {failed_connection_time:.2f}s)"
            )
            # Don't re-raise to prevent killing the nursery and other parallel tasks
            logger.debug("Continuing with other parallel connection attempts")

    async def _monitor_disconnections(self) -> None:
        """
        Monitor bootstrap peer connections and immediately reconnect when they drop.

        This runs as a background task that efficiently detects
        disconnections in real-time.
        """
        self._disconnect_monitor_running = True
        logger.info(
            "Disconnect monitor started - will reconnect "
            "immediately when connections drop"
        )

        try:
            while True:
                # Check for disconnections more frequently but efficiently
                await trio.sleep(1.0)  # Check every second for responsiveness

                # Check which bootstrap peers are no longer connected
                disconnected_peers = []
                for peer_id in list(self.connected_bootstrap_peers):
                    if peer_id not in self.swarm.connections:
                        disconnected_peers.append(peer_id)
                        self.connected_bootstrap_peers.discard(peer_id)
                        logger.info(
                            f"⚠️ Detected disconnection from bootstrap peer: {peer_id}"
                        )

                # Immediately reconnect to disconnected peers
                if disconnected_peers:
                    logger.info(
                        f"🔄 Immediately reconnecting to {len(disconnected_peers)} "
                        f"disconnected bootstrap peer(s)"
                    )

                    # Reconnect in parallel for better performance
                    try:
                        async with trio.open_nursery() as reconnect_nursery:
                            for peer_id in disconnected_peers:
                                logger.info(f"🔌 Reconnecting to {peer_id}")
                                reconnect_nursery.start_soon(
                                    self._reconnect_to_peer, peer_id
                                )
                    except trio.Cancelled:
                        logger.debug("Reconnection nursery cancelled")
                        raise
                    except Exception as e:
                        logger.warning(f"Reconnection nursery failed: {e}")
        except trio.Cancelled:
            logger.info("Disconnect monitor stopped - task cancelled")
        except Exception as e:
            logger.error(f"Unexpected error in disconnect monitor: {e}")
        finally:
            self._disconnect_monitor_running = False
            logger.debug("Disconnect monitor task cleanup completed")

    async def _reconnect_to_peer(self, peer_id: ID) -> None:
        """
        Reconnect to a specific bootstrap peer with backoff on failure.

        This method includes simple backoff logic to avoid overwhelming
        peers that may be temporarily unavailable.
        """
        max_attempts = 3
        base_delay = 1.0

        try:
            for attempt in range(1, max_attempts + 1):
                try:
                    logger.debug(
                        f"Reconnection attempt {attempt}/{max_attempts} for {peer_id}"
                    )
                    await self._connect_to_peer(peer_id)

                    # If we get here, connection was successful
                    if peer_id in self.swarm.connections:
                        logger.info(
                            f"✅ Successfully reconnected to {peer_id} on "
                            f"attempt {attempt}"
                        )
                        return
                except Exception as e:
                    logger.debug(
                        f"Reconnection attempt {attempt} failed for {peer_id}: {e}"
                    )

                # Wait before next attempt (exponential backoff)
                if attempt < max_attempts:
                    delay = base_delay * (2 ** (attempt - 1))  # 1s, 2s, 4s
                    logger.debug(
                        f"Waiting {delay}s before next reconnection attempt"
                    )
                    await trio.sleep(delay)

            logger.warning(
                f"❌ Failed to reconnect to {peer_id} after {max_attempts} attempts"
            )
        except Exception as e:
            # Catch any unexpected errors to prevent crashing the nursery
            logger.error(f"❌ Unexpected error during reconnection to {peer_id}: {e}")
            # Don't re-raise to keep other parallel reconnection tasks running

    def _is_ipv4_tcp_addr(self, addr: Multiaddr) -> bool:
        """
        Check if address is IPv4 with TCP protocol only.

        This restricts dialing attempts to addresses that conform to IPv4+TCP,
        filtering out IPv6, UDP, QUIC, WebSocket, and other unsupported protocols.
        """
        try:
            protocols = addr.protocols()

            # Must have IPv4 protocol
            has_ipv4 = any(p.name == "ip4" for p in protocols)
            if not has_ipv4:
                return False

            # Must have TCP protocol
            has_tcp = any(p.name == "tcp" for p in protocols)
            if not has_tcp:
                return False

            return True
        except Exception:
            # If we can't parse the address, don't use it
            return False
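Taken together, a hedged end-to-end sketch of how this class might be wired into a node. The module path for BootstrapDiscovery, the new_host()/host.run()/get_network() calls, and the bootstrap entries are assumptions for illustration and may differ from the actual py-libp2p API and deployment:

import trio
from multiaddr import Multiaddr

from libp2p import new_host
# Module path assumed for illustration; adjust to wherever BootstrapDiscovery lives.
from libp2p.discovery.bootstrap.bootstrap import BootstrapDiscovery

BOOTSTRAP_ADDRS = [
    # Illustrative entries: one dnsaddr (resolved via DNSResolver) and one ip4/tcp.
    "/dnsaddr/bootstrap.libp2p.io/p2p/QmNnooDu7bfjPFoTZYxMNLWUQJyrVwtbZg5gBMjTezGAJN",
    "/ip4/104.131.131.82/tcp/4001/p2p/QmaCpDMGvV2BGHeYERUEnRQAwe3N8SzbUtfsmvsqQLuvuJ",
]

async def main() -> None:
    host = new_host()
    async with host.run(listen_addrs=[Multiaddr("/ip4/0.0.0.0/tcp/0")]):
        discovery = BootstrapDiscovery(host.get_network(), BOOTSTRAP_ADDRS)
        try:
            # start() filters to IPv4+TCP, dials with fallback, and launches
            # the disconnect monitor as a background system task.
            await discovery.start()
            await trio.sleep(60)  # keep the run alive so the monitor can reconnect
        finally:
            discovery.stop()

trio.run(main)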