Implement support for Path MTU Discovery
author Stefan Berghofer <stefan.berghofer@secunet.com>
Fri, 5 Jun 2015 13:54:32 +0000 (15:54 +0200)
committer Reto Buerki <reet@codelabs.ch>
Thu, 11 Jun 2015 07:32:05 +0000 (09:32 +0200)
The module's pmtu parameter specifies the input channels from which
discovered Path MTU values are read. If an outgoing packet exceeds the
associated PMTU value, the packet is dropped and an ICMP Fragmentation
Needed (Type 3, Code 4) message containing the current PMTU value is
sent back to the sender so that it can reduce its packet size. In case
of IPv6, an ICMPv6 Packet Too Big (Type 2) message is used instead.

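In order for this to work, the outgoing packets must be marked by
Netfilter's nfmark functionality. The configured mark is the 1-based
index into the PMTU array (mark N selects entry N-1); packets without a
mark, or with a mark beyond the end of the array, are not checked.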

internal.h
net.c
writer.c

index 0c04bc9..b1269ae 100644 (file)
@@ -50,6 +50,8 @@ struct dev_info {
        struct net_device_stats stats;   /**< contains receive and transmit information            */
        char *bus_info;                  /**< text representation for input and output association */
        int mtu;                         /**< MTU for this interface                               */
+       u32 *pmtu;                       /**< PMTUs for this interface                             */
+       size_t pmtu_elements;            /**< maximum number of PMTUs supported                    */
        unsigned long flags;             /**< flags given on the command line                      */
        spinlock_t writer_lock;          /**< lock for accessing the writer part                   */
        struct muchannel *channel_out;   /**< output channel for write operations                  */
@@ -176,7 +178,8 @@ void writer_down(struct dev_info *dev_info);
  * @brief Initializes the network device writer part
  */
 int initialize_writer(struct dev_info *dev_info,
-                     const struct muen_channel_info * const region);
+                     const struct muen_channel_info * const region,
+                     const struct muen_channel_info * const pmtu_region);
 
 /**
  * @brief Shuts down the network device writer part
diff --git a/net.c b/net.c
index 2b2ee6d..bd0ac68 100644 (file)
--- a/net.c
+++ b/net.c
@@ -262,6 +262,7 @@ static int add_device(const char *device_name,
                      const char *input,
                      const char *output,
                      int mtu,
+                     const char *pmtu,
                      u64 writer_protocol,
                      u64 reader_protocol,
                      unsigned long flags,
@@ -271,7 +272,7 @@ static int add_device(const char *device_name,
        struct net_device *dev;
        struct dev_info *dev_info;
        size_t bus_info_len = 2; /* place for separator and finishing \0 */
-       struct muen_channel_info reader_channel, writer_channel;
+       struct muen_channel_info reader_channel, writer_channel, pmtu_channel;
 
        if (input)
                bus_info_len += strlen(input);
@@ -335,7 +336,20 @@ static int add_device(const char *device_name,
                                   "Output channel '%s' not found\n", output);
                        goto err_cleanup_reader;
                }
-               ret = initialize_writer(dev_info, &writer_channel);
+
+               if (pmtu && strlen(pmtu) > 0) {
+                       if (!muen_get_channel_info(pmtu, &pmtu_channel)) {
+                               netdev_err(dev_info->dev,
+                                          "PMTU channel '%s' not found\n",
+                                          pmtu);
+                               goto err_cleanup_reader;
+                       }
+                       ret = initialize_writer(dev_info, &writer_channel,
+                                               &pmtu_channel);
+               } else
+                       ret = initialize_writer(dev_info, &writer_channel,
+                                               NULL);
+
                if (ret < 0) {
                        netdev_err(dev_info->dev,
                                   "Unable to init writer (status: %d)\n", ret);
@@ -415,6 +429,14 @@ static char *out[MAX_INTERFACES];
  */
 static char *mtu[MAX_INTERFACES];
 
+/**
+ * @brief Memory regions for PMTU values
+ *
+ * This array is filled with the list of memory regions holding the PMTU values
+ * for each writer.
+ */
+static char *pmtu[MAX_INTERFACES];
+
 /**
  * @brief Interface flags
  *
@@ -456,6 +478,8 @@ module_param_array(in, charp, NULL, 0444);
 MODULE_PARM_DESC(in, "List of input memregions, separated with comma (empty values permitted)");
 module_param_array(out, charp, NULL, 0444);
 MODULE_PARM_DESC(out, "List of output memregions, separated with comma (empty values permitted)");
+module_param_array(pmtu, charp, NULL, 0444);
+MODULE_PARM_DESC(pmtu, "List of input memregions holding PMTU values");
 module_param_array(mtu, charp, NULL, 0444);
 MODULE_PARM_DESC(mtu, "List of MTUs to use, separated with comma (default is 1500)");
 module_param_array(writer_protocol, charp, NULL, 0444);
@@ -600,7 +624,7 @@ static int __init muennet_init(void)
                        flag_value = ret;
                }
 
-               ret = add_device(name[i], in[i], out[i], device_mtu,
+               ret = add_device(name[i], in[i], out[i], device_mtu, pmtu[i],
                                 device_writer_protocol, device_reader_protocol,
                                 flag_value, poll);
                if (ret < 0)
index e8582a3..8fdc3af 100644 (file)
--- a/writer.c
+++ b/writer.c
@@ -17,6 +17,9 @@
  * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
  */
 
+#include <linux/inetdevice.h>
+#include <net/ip.h>
+#include <net/icmp.h>
 #include <muen/writer.h>
 
 #include "internal.h"
@@ -104,7 +107,8 @@ void cleanup_writer(struct dev_info *dev_info)
  * @return errors returned by #common_check_region
  */
 int initialize_writer(struct dev_info *dev_info,
-                     const struct muen_channel_info * const channel)
+                     const struct muen_channel_info * const channel,
+                     const struct muen_channel_info * const pmtu_channel)
 {
        /* some sanity checks */
        if (!channel->writable) {
@@ -125,6 +129,21 @@ int initialize_writer(struct dev_info *dev_info,
                return -EFAULT;
        }
 
+       if (pmtu_channel != NULL) {
+               dev_info->pmtu_elements = pmtu_channel->size / sizeof(u32);
+
+               dev_info->pmtu = ioremap_cache(pmtu_channel->address,
+                                              pmtu_channel->size);
+               if (dev_info->channel_out == NULL) {
+                       netdev_err(dev_info->dev,
+                                  "Unable to map writer PMTU channel\n");
+                       return -EFAULT;
+               }
+       } else {
+               dev_info->pmtu_elements = 0;
+               dev_info->pmtu = NULL;
+       }
+
        /* initialize the lock */
        spin_lock_init(&dev_info->writer_lock);
 
@@ -175,6 +194,50 @@ int muennet_xmit(struct sk_buff *skb, struct net_device *dev)
        if (!spin_trylock_irqsave(&dev_info->writer_lock, flags))
                return NETDEV_TX_LOCKED;
 
+       if (dev_info->pmtu != NULL &&
+           1 <= skb->mark && skb->mark <= dev_info->pmtu_elements) {
+               u32 pmtu = dev_info->pmtu[skb->mark - 1];
+
+               if (skb->len > pmtu) {
+                       struct iphdr *iph;
+                       struct flowi4 fl4;
+                       struct rtable *rt = NULL;
+
+                       switch (skb->protocol) {
+                       case htons(ETH_P_IP):
+                               spin_unlock_irqrestore(&dev_info->writer_lock,
+                                                      flags);
+                               iph = ip_hdr(skb);
+                               memset(&fl4, 0, sizeof(fl4));
+                               fl4.flowi4_oif = dev->ifindex;
+                               fl4.flowi4_tos = RT_TOS(iph->tos);
+                               fl4.daddr = iph->daddr;
+                               fl4.saddr = inet_select_addr(dev, iph->saddr,
+                                                            RT_SCOPE_UNIVERSE);
+
+                               rt = ip_route_output_key(dev_net(dev), &fl4);
+                               if (!IS_ERR(rt)) {
+                                       skb_dst_set(skb, &rt->dst);
+                                       icmp_send(skb, ICMP_DEST_UNREACH,
+                                                 ICMP_FRAG_NEEDED,
+                                                 htonl(pmtu));
+                               } else
+                                       netdev_err(dev_info->dev,
+                                                  "Route lookup for ICMP failed (dst: %pI4, src: %pI4)\n",
+                                                  &fl4.daddr, &fl4.saddr);
+
+                               dev_kfree_skb(skb);
+                               return NET_XMIT_SUCCESS;
+                       case htons(ETH_P_IPV6):
+                               spin_unlock_irqrestore(&dev_info->writer_lock,
+                                                      flags);
+                               icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, pmtu);
+                               dev_kfree_skb(skb);
+                               return NET_XMIT_SUCCESS;
+                       }
+               }
+       }
+
        if ((dev_info->flags & MUENNET_HDR)) {
                int len = skb->len;
                struct net_hdr *hdr;