# py-libp2p/libp2p/discovery/bootstrap/bootstrap.py
import logging

from multiaddr import Multiaddr
from multiaddr.resolvers import DNSResolver
import trio

from libp2p.abc import ID, INetworkService, PeerInfo
from libp2p.discovery.events.peerDiscovery import peerDiscovery
from libp2p.network.exceptions import SwarmException
from libp2p.peer.peerinfo import info_from_p2p_addr

logger = logging.getLogger("libp2p.discovery.bootstrap")
resolver = DNSResolver()


class BootstrapDiscovery:
"""
Bootstrap-based peer discovery for py-libp2p.
Processes bootstrap addresses in parallel and attempts initial connections.
Adds discovered peers to peerstore for network bootstrapping.
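    Usage sketch (illustrative; the host and bootstrap addresses are
    placeholders, not part of this module):

        discovery = BootstrapDiscovery(host.get_network(), bootstrap_addrs)
        await discovery.start()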
"""

    def __init__(self, swarm: INetworkService, bootstrap_addrs: list[str]):
"""
Initialize BootstrapDiscovery.
Args:
swarm: The network service (swarm) instance
bootstrap_addrs: List of bootstrap peer multiaddresses
"""
self.swarm = swarm
self.peerstore = swarm.peerstore
self.bootstrap_addrs = bootstrap_addrs or []
self.discovered_peers: set[str] = set()
        self.connection_timeout: int = 10  # seconds per dial attempt
        # Track connected peers for drop detection
        self.connected_peers: set[ID] = set()

    async def start(self) -> None:
"""Process bootstrap addresses and emit peer discovery events in parallel."""
logger.info(
f"Starting bootstrap discovery with "
f"{len(self.bootstrap_addrs)} bootstrap addresses"
)
# Show all bootstrap addresses being processed
        for i, addr in enumerate(self.bootstrap_addrs, 1):
            logger.info(f"{i}. {addr}")
# Allow other tasks to run
await trio.lowlevel.checkpoint()
        # Address validation is currently disabled; all configured addresses
        # are passed through unchanged.
        # self.bootstrap_addrs = validate_bootstrap_addresses(self.bootstrap_addrs)
        logger.info(f"Addresses to process: {len(self.bootstrap_addrs)}")
# Allow other tasks to run after validation
await trio.lowlevel.checkpoint()
# Use Trio nursery for PARALLEL address processing
try:
async with trio.open_nursery() as nursery:
logger.info(
f"Starting {len(self.bootstrap_addrs)} parallel address "
f"processing tasks"
)
                # Start all bootstrap address processing tasks in parallel.
                # The _safe wrapper catches per-address failures so one bad
                # address cannot cancel the nursery and its sibling tasks.
                for addr_str in self.bootstrap_addrs:
                    logger.info(f"Starting parallel task for: {addr_str}")
                    nursery.start_soon(self._process_bootstrap_addr_safe, addr_str)
# The nursery will wait for all address processing tasks to complete
logger.info(
"Nursery active - waiting for address processing tasks to complete"
)
except trio.Cancelled:
logger.info("Bootstrap address processing cancelled - cleaning up tasks")
raise
except Exception as e:
logger.error(f"Bootstrap address processing failed: {e}")
raise
logger.info("Bootstrap discovery startup complete - all tasks finished")

    def stop(self) -> None:
"""Clean up bootstrap discovery resources."""
logger.info("Stopping bootstrap discovery and cleaning up tasks")
# Clear discovered peers
self.discovered_peers.clear()
self.connected_peers.clear()
logger.debug("Bootstrap discovery cleanup completed")

    async def _process_bootstrap_addr_safe(self, addr_str: str) -> None:
"""Safely process a bootstrap address with exception handling."""
try:
await self._process_bootstrap_addr(addr_str)
except Exception as e:
logger.warning(f"Failed to process bootstrap address {addr_str}: {e}")
# Ensure task cleanup and continue processing other addresses

    async def _process_bootstrap_addr(self, addr_str: str) -> None:
"""Convert string address to PeerInfo and add to peerstore."""
try:
multiaddr = Multiaddr(addr_str)
except Exception as e:
logger.debug(f"Invalid multiaddr format '{addr_str}': {e}")
return
if self.is_dns_addr(multiaddr):
# Allow other tasks to run during DNS resolution
await trio.lowlevel.checkpoint()
resolved_addrs = await resolver.resolve(multiaddr)
if resolved_addrs is None:
logger.warning(f"DNS resolution returned None for: {addr_str}")
return
# Allow other tasks to run after DNS resolution
await trio.lowlevel.checkpoint()
peer_id_str = multiaddr.get_peer_id()
if peer_id_str is None:
logger.warning(f"Missing peer ID in DNS address: {addr_str}")
return
peer_id = ID.from_base58(peer_id_str)
            addrs = list(resolved_addrs)
if not addrs:
logger.warning(f"No addresses resolved for DNS address: {addr_str}")
return
peer_info = PeerInfo(peer_id, addrs)
await self.add_addr(peer_info)
else:
peer_info = info_from_p2p_addr(multiaddr)
await self.add_addr(peer_info)

    def is_dns_addr(self, addr: Multiaddr) -> bool:
"""Check if the address is a DNS address."""
return any(protocol.name == "dnsaddr" for protocol in addr.protocols())
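
    # For reference, the two bootstrap address shapes handled above
    # (placeholder values, not real peers):
    #     /dnsaddr/bootstrap.example.com/p2p/<peer-id>   (resolved via DNS)
    #     /ip4/203.0.113.7/tcp/4001/p2p/<peer-id>        (used directly)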

    async def add_addr(self, peer_info: PeerInfo) -> None:
"""
Add a peer to the peerstore, emit discovery event,
and attempt connection in parallel.
"""
logger.info(f"Adding peer to peerstore: {peer_info.peer_id}")
logger.info(f"Total addresses received: {len(peer_info.addrs)}")
# Skip if it's our own peer
if peer_info.peer_id == self.swarm.get_peer_id():
logger.debug(f"Skipping own peer ID: {peer_info.peer_id}")
return
        # Keep only IPv4+TCP addresses (the only transport currently supported)
ipv4_tcp_addrs = []
filtered_out_addrs = []
for addr in peer_info.addrs:
if self._is_ipv4_tcp_addr(addr):
ipv4_tcp_addrs.append(addr)
else:
filtered_out_addrs.append(addr)
# Log filtering results
logger.info(f"Address filtering for {peer_info.peer_id}:")
logger.info(f"IPv4+TCP addresses: {len(ipv4_tcp_addrs)}")
logger.info(f"Filtered out: {len(filtered_out_addrs)} (unsupported protocols)")
# Show filtered addresses for debugging
if filtered_out_addrs:
for addr in filtered_out_addrs:
logger.debug(f"Filtered: {addr}")
# Show addresses that will be used
if ipv4_tcp_addrs:
logger.debug("Usable addresses:")
for i, addr in enumerate(ipv4_tcp_addrs, 1):
logger.debug(f" Address {i}: {addr}")
# Skip peer if no IPv4+TCP addresses available
if not ipv4_tcp_addrs:
logger.warning(
f"❌ No IPv4+TCP addresses for {peer_info.peer_id} - "
f"skipping connection attempts"
)
return
logger.info(
f"Will attempt connection using {len(ipv4_tcp_addrs)} IPv4+TCP addresses"
)
# Add only IPv4+TCP addresses to peerstore
self.peerstore.add_addrs(peer_info.peer_id, ipv4_tcp_addrs, 0)
# Allow other tasks to run after adding to peerstore
await trio.lowlevel.checkpoint()
# Verify addresses were added
stored_addrs = self.peerstore.addrs(peer_info.peer_id)
logger.info(f"Addresses stored in peerstore: {len(stored_addrs)} addresses")
# Only emit discovery event if this is the first time we see this peer
peer_id_str = str(peer_info.peer_id)
if peer_id_str not in self.discovered_peers:
# Track discovered peer
self.discovered_peers.add(peer_id_str)
# Emit peer discovery event
peerDiscovery.emit_peer_discovered(peer_info)
logger.debug(f"Peer discovered: {peer_info.peer_id}")
            # Run the connection attempt in its own nursery. Note that the
            # nursery waits for the attempt to finish; parallelism across
            # peers comes from the per-address tasks started in start().
            try:
                async with trio.open_nursery() as connection_nursery:
                    logger.info("Starting connection attempt...")
connection_nursery.start_soon(
self._connect_to_peer, peer_info.peer_id
)
except trio.Cancelled:
logger.debug(f"Connection attempt cancelled for {peer_info.peer_id}")
raise
except Exception as e:
logger.warning(
f"Connection nursery failed for {peer_info.peer_id}: {e}"
)
else:
logger.debug(
f"Additional addresses added for existing peer: {peer_info.peer_id}"
)
# Even for existing peers, try to connect if not already connected
if peer_info.peer_id not in self.swarm.connections:
                logger.info("Starting connection attempt for existing peer...")
                # As above, the nursery waits for the dial attempt to finish
try:
async with trio.open_nursery() as connection_nursery:
connection_nursery.start_soon(
self._connect_to_peer, peer_info.peer_id
)
except trio.Cancelled:
logger.debug(
f"Connection attempt cancelled for existing peer "
f"{peer_info.peer_id}"
)
raise
except Exception as e:
logger.warning(
f"Connection nursery failed for existing peer "
f"{peer_info.peer_id}: {e}"
)

    async def _connect_to_peer(self, peer_id: ID) -> None:
"""
Attempt to establish a connection to a peer with timeout.
Uses swarm.dial_peer to connect using addresses stored in peerstore.
Times out after connection_timeout seconds to prevent hanging.
"""
logger.info(f"Connection attempt for peer: {peer_id}")
# Pre-connection validation: Check if already connected
if peer_id in self.swarm.connections:
logger.debug(
f"Already connected to {peer_id} - skipping connection attempt"
)
return
# Allow other tasks to run before connection attempt
await trio.lowlevel.checkpoint()
# Check available addresses before attempting connection
available_addrs = self.peerstore.addrs(peer_id)
logger.info(
f"Available addresses for {peer_id}: {len(available_addrs)} addresses"
)
# Log all available addresses for transparency
for i, addr in enumerate(available_addrs, 1):
logger.debug(f" Address {i}: {addr}")
if not available_addrs:
logger.error(f"❌ No addresses available for {peer_id} - cannot connect")
return
# Record start time for connection attempt monitoring
connection_start_time = trio.current_time()
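        # trio.fail_after raises TooSlowError when the deadline expires; the
        # except clause below turns that into a warning instead of a crash.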
try:
            with trio.fail_after(self.connection_timeout):
# Log connection attempt
logger.info(
f"Attempting connection to {peer_id} using "
f"{len(available_addrs)} addresses"
)
# Log each address that will be attempted
for i, addr in enumerate(available_addrs, 1):
logger.debug(f"Address {i}: {addr}")
# Use swarm.dial_peer to connect using stored addresses
connection = await self.swarm.dial_peer(peer_id)
# Calculate connection time
connection_time = trio.current_time() - connection_start_time
# Allow other tasks to run after dial attempt
await trio.lowlevel.checkpoint()
# Post-connection validation: Verify connection was actually established
if peer_id in self.swarm.connections:
logger.info(
f"✅ Connected to {peer_id} (took {connection_time:.2f}s)"
)
# Track this connection for drop monitoring
self.connected_peers.add(peer_id)
# Start monitoring this specific connection for drops
trio.lowlevel.spawn_system_task(
self._monitor_peer_connection, peer_id
)
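                    # (spawn_system_task runs the monitor under trio's system
                    # nursery, so it keeps running after this dial attempt ends)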
# Log which address was successful (if available)
if hasattr(connection, "get_transport_addresses"):
successful_addrs = connection.get_transport_addresses()
if successful_addrs:
logger.debug(f"Successful address: {successful_addrs[0]}")
else:
logger.warning(
f"Dial succeeded but connection not found for {peer_id}"
)
except trio.TooSlowError:
logger.warning(
f"❌ Connection to {peer_id} timed out after {self.connection_timeout}s"
)
except SwarmException as e:
# Calculate failed connection time
failed_connection_time = trio.current_time() - connection_start_time
# Enhanced error logging
error_msg = str(e)
if "no addresses established a successful connection" in error_msg:
logger.warning(
f"❌ Failed to connect to {peer_id} after trying all "
f"{len(available_addrs)} addresses "
f"(took {failed_connection_time:.2f}s)"
)
# Log individual address failures if this is a MultiError
                exceptions_list = getattr(e.__cause__, "exceptions", None)
                if exceptions_list:
logger.info("📋 Individual address failure details:")
for i, addr_exception in enumerate(exceptions_list, 1):
logger.info(f"Address {i}: {addr_exception}")
# Also log the actual address that failed
if i <= len(available_addrs):
logger.info(f"Failed address: {available_addrs[i - 1]}")
else:
logger.warning("No detailed exception information available")
else:
logger.warning(
f"❌ Failed to connect to {peer_id}: {e} "
f"(took {failed_connection_time:.2f}s)"
)
except Exception as e:
# Handle unexpected errors that aren't swarm-specific
failed_connection_time = trio.current_time() - connection_start_time
logger.error(
f"❌ Unexpected error connecting to {peer_id}: "
f"{e} (took {failed_connection_time:.2f}s)"
)
# Don't re-raise to prevent killing the nursery and other parallel tasks
logger.debug("Continuing with other parallel connection attempts")

    async def _monitor_peer_connection(self, peer_id: ID) -> None:
        """
        Monitor a specific peer connection and detect when it drops.
        Polls swarm.connections and returns once the connection is removed,
        which happens when error 4101 or other connection errors occur.
        """
logger.debug(f"🔍 Started monitoring connection to {peer_id}")
try:
            # Poll until the connection disappears from the swarm
            while peer_id in self.swarm.connections:
                await trio.sleep(0.1)  # small poll interval that also yields control
# Connection was dropped - log it immediately
if peer_id in self.connected_peers:
self.connected_peers.discard(peer_id)
            logger.warning(f"📡 Connection to {peer_id} was dropped!")
# Log current connection count
remaining_connections = len(self.connected_peers)
logger.info(f"📊 Remaining connected peers: {remaining_connections}")
except trio.Cancelled:
logger.debug(f"Connection monitoring for {peer_id} stopped")
except Exception as e:
logger.error(f"Error monitoring connection to {peer_id}: {e}")
# Clean up tracking on error
self.connected_peers.discard(peer_id)

    def _is_ipv4_tcp_addr(self, addr: Multiaddr) -> bool:
"""
Check if address is IPv4 with TCP protocol only.
Filters out IPv6, UDP, QUIC, WebSocket, and other unsupported protocols.
Only IPv4+TCP addresses are supported by the current transport.
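        Examples (illustrative): "/ip4/203.0.113.7/tcp/4001" passes, while
        "/ip6/::1/tcp/4001" and "/ip4/203.0.113.7/udp/4001/quic-v1" are
        filtered out.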
"""
        try:
            protocols = addr.protocols()
            # Require both an IPv4 component and a TCP component
            has_ipv4 = any(p.name == "ip4" for p in protocols)
            has_tcp = any(p.name == "tcp" for p in protocols)
            return has_ipv4 and has_tcp
except Exception:
# If we can't parse the address, don't use it
return False
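

if __name__ == "__main__":
    # Hedged demo sketch, not part of the original module: it assumes the
    # top-level `new_host` factory and the `host.run()` context manager from
    # py-libp2p, and uses a placeholder bootstrap address that will simply
    # fail to parse and be skipped.
    from libp2p import new_host

    async def _demo() -> None:
        host = new_host()
        async with host.run(listen_addrs=[Multiaddr("/ip4/0.0.0.0/tcp/0")]):
            discovery = BootstrapDiscovery(
                host.get_network(),
                ["/ip4/203.0.113.7/tcp/4001/p2p/<peer-id>"],  # placeholder
            )
            await discovery.start()
            discovery.stop()

    trio.run(_demo)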