\contentsline {section}{\numberline {1}Introduction}{5} \contentsline {subsection}{\numberline {1.1}Motivation}{5} \contentsline {subsection}{\numberline {1.2}Task Formulation}{5} \contentsline {section}{\numberline {2}Congestion Avoidance}{6} \contentsline {subsection}{\numberline {2.1}Different Types of Congestion}{6} \contentsline {subsubsection}{\numberline {2.1.1}Receiver Congestion}{6} \contentsline {subsubsection}{\numberline {2.1.2}Network Congestion}{6} \contentsline {subsection}{\numberline {2.2}Preventing Network Congestion}{7} \contentsline {subsection}{\numberline {2.3}Simplifications in a Cluster Environment}{8} \contentsline {subsection}{\numberline {2.4}Measuring Congestion Performance}{9} \contentsline {subsubsection}{\numberline {2.4.1}Introducing Netgauge}{9} \contentsline {subsubsection}{\numberline {2.4.2}Transport Modules}{9} \contentsline {subsubsection}{\numberline {2.4.3}Communication Patterns}{11} \contentsline {subsubsection}{\numberline {2.4.4}A Communication Pattern to provoke Network Congestion}{11} \contentsline {subsubsection}{\numberline {2.4.5}Results and Comparison}{12} \contentsline {section}{\numberline {3}Flow Control in a Cluster Environment}{15} \contentsline {subsection}{\numberline {3.1}Evaluation of Flow Control Schemes for TCP/IP}{15} \contentsline {subsubsection}{\numberline {3.1.1}TCP Reno}{15} \contentsline {subsubsection}{\numberline {3.1.2}High Speed TCP}{17} \contentsline {subsubsection}{\numberline {3.1.3}TCP Vegas}{17} \contentsline {subsubsection}{\numberline {3.1.4}Conclusion}{18} \contentsline {subsection}{\numberline {3.2}Design of a Flow Control Scheme suitable for Clusters}{18} \contentsline {subsubsection}{\numberline {3.2.1}Basic Principles}{19} \contentsline {subsubsection}{\numberline {3.2.2}Handling of Receiver Congestion}{19} \contentsline {subsubsection}{\numberline {3.2.3}Handling Retransmissions}{20} \contentsline {subsubsection}{\numberline {3.2.4}Requesting Transmission Slots}{21} \contentsline 
{subsubsection}{\numberline {3.2.5}Deciding when to send an Acknowledgement}{22} \contentsline {subsubsection}{\numberline {3.2.6}Handling multiple Senders}{22} \contentsline {subsubsection}{\numberline {3.2.7}Additional Differences to TCP}{23} \contentsline {subsubsection}{\numberline {3.2.8}Conclusion}{25} \contentsline {section}{\numberline {4}Implementation of the Flow Control for ESP}{27} \contentsline {subsection}{\numberline {4.1}Design of the ESP Output Engine}{28} \contentsline {subsection}{\numberline {4.2}Receiving Data from User-space}{29} \contentsline {subsection}{\numberline {4.3}Sending Data to the Receiver}{31} \contentsline {subsection}{\numberline {4.4}Managing the ACK queue}{32} \contentsline {subsubsection}{\numberline {4.4.1}Enqueuing an ACK}{32} \contentsline {subsection}{\numberline {4.5}State Information used by the Flow Control}{33} \contentsline {subsubsection}{\numberline {4.5.1}Per-Socket Information}{33} \contentsline {subsubsection}{\numberline {4.5.2}Global Information}{34} \contentsline {section}{\numberline {5}Targeting Production Readiness}{35} \contentsline {subsection}{\numberline {5.1}Debugging Strategies}{35} \contentsline {subsubsection}{\numberline {5.1.1}Debugging with KGDB}{35} \contentsline {subsubsection}{\numberline {5.1.2}Analyzing Kernel Oops Messages}{36} \contentsline {subsection}{\numberline {5.2}Creating an Interface for User Settings}{38} \contentsline {subsubsection}{\numberline {5.2.1}The Sysctl Interface}{38} \contentsline {subsubsection}{\numberline {5.2.2}Adding a Sysctl Interface to ESP}{39} \contentsline {subsubsection}{\numberline {5.2.3}Using the Sysctl Interface}{40} \contentsline {section}{\numberline {6}Analysis and Evaluation}{42} \contentsline {subsection}{\numberline {6.1}Comparison of the proposed Flow Control Scheme to TCP}{42} \contentsline {subsubsection}{\numberline {6.1.1}Handling of Packet Loss}{42} \contentsline {subsubsection}{\numberline {6.1.2}Ramp up Times}{42} \contentsline 
{subsubsection}{\numberline {6.1.3}Fairness Considerations}{43} \contentsline {subsection}{\numberline {6.2}Searching the ESP Parameter Space}{44} \contentsline {subsection}{\numberline {6.3}Assessing the Overhead}{46} \contentsline {subsubsection}{\numberline {6.3.1}The LogGP model}{46} \contentsline {subsubsection}{\numberline {6.3.2}Existing Approaches to assess LogGP Parameters}{48} \contentsline {subsubsection}{\numberline {6.3.3}Assessing the overhead with a linear system of equations}{51} \contentsline {subsubsection}{\numberline {6.3.4}The parameterized LogP model}{54} \contentsline {subsubsection}{\numberline {6.3.5}Assessing the P-LogP parameters}{55} \contentsline {subsection}{\numberline {6.4}General Overhead Assessment}{55} \contentsline {subsection}{\numberline {6.5}Throughput Comparison}{58} \contentsline {subsubsection}{\numberline {6.5.1}Single Sender}{58} \contentsline {subsubsection}{\numberline {6.5.2}Congestion Avoidance Stress Test}{58} \contentsline {subsection}{\numberline {6.6}Application Benchmarks}{60} \contentsline {subsubsection}{\numberline {6.6.1}Optimizing for real-world Applications}{60} \contentsline {subsubsection}{\numberline {6.6.2}NAS Parallel Benchmarks}{62} \contentsline {subsubsection}{\numberline {6.6.3}ABINIT}{62} \contentsline {section}{\numberline {7}Summary and Conclusion}{63} \contentsline {section}{\numberline {A}Appendix}{65} \contentsline {subsection}{Corrected Bugs and Optimizations}{65} \contentsline {subsection}{Configuration of the Cluster test Systems}{67} \contentsline {subsection}{References}{68} \contentsline {subsection}{List of Figures}{71} \contentsline {subsection}{Statutory Declaration}{72}