-
Notifications
You must be signed in to change notification settings - Fork 7
/
mpls_bpf_kern.c
311 lines (263 loc) · 10.5 KB
/
mpls_bpf_kern.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
/*******************************************************************************************
* MPLSinIP eBPF
* This file contains a BPF (Berkeley Packet Filter) for use within tc
* (traffic-control).
*
* BPF is a virtual-machine within the Linux kernel that supports a limited
* instruction set (not Turing complete). It allows user supplied code to be
* executed during key points within the kernel. The kernel verifies all BPF
* programs so that they don't address invalid memory & that the time spent in
* the BPF program is limited by dis-allowing loops & setting a maximum number
* of instructions.
*
*
* ----------------------------------------------------------------------------------------
* eBPF Guide & Checklist
*
* 1. eBPF does not support method calls so any function called from the
* entry-point needs to be inlined.
* 2. The kernel can JIT the eBPF however prior to 4.15, it was off by default
* (value 0). echo 1 > /proc/sys/net/core/bpf_jit_enable
*
* @author Farid Zakaria <farid.m.zakaria\@gmail.com>
*******************************************************************************************/
#include <assert.h>
#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/ip.h>
#include <linux/pkt_cls.h>
#include "bpf_endian.h"
#include "bpf_helpers.h"
#include "helpers.h"
#include "mpls.h"
/*
* For learning purposes let's just use a fixed MPLS label
*/
#define MPLS_STATIC_LABEL 69
/**
* Adjust room at the network layer
* (room space is added or removed below the layer 3 header).
* https://elixir.bootlin.com/linux/v5.3.6/source/include/uapi/linux/bpf.h#L1536
*/
#define BPF_ADJ_ROOM_NET 0
/**
* A really simple BPF map that controls a switch
* whether the debug printk messages are emitted.
*/
struct bpf_elf_map SEC("maps") DEBUGS_MAP = {
.type = BPF_MAP_TYPE_ARRAY,
.size_key = sizeof(unsigned int),
.size_value = sizeof(bool),
.pinning = PIN_GLOBAL_NS,
.max_elem = 1,
};
/*
* Check whether the debug flag is set via user space.
*/
bool is_debug(void);
forced_inline bool is_debug() {
int index = 0; // the map has size of 1 so index is always 0
bool *value = (bool *)bpf_map_lookup_elem(&DEBUGS_MAP, &index);
if (!value) {
return false;
}
return *value;
}
#define bpf_debug_printk(fmt, ...) \
({ \
if (unlikely(is_debug())) { \
char ____fmt[] = fmt; \
bpf_trace_printk(____fmt, sizeof(____fmt), ##__VA_ARGS__); \
} \
})
/*
* The Internet Protocol (IP) is defined in RFC 791.
* The RFC specifies the format of the IP header.
* In the header there is the IHL (Internet Header Length) field which is 4bit
* long
* and specifies the header length in 32bit words.
* The IHL field can hold values from 0 (Binary 0000) to 15 (Binary 1111).
* 15 * 32bits = 480bits = 60 bytes
*/
#define MAX_IP_HDR_LEN 60
static_assert(sizeof(struct ethhdr) == ETH_HLEN,
"ethernet header size does not match.");
/*
* Since packet handling and printk can be interleaved, this will
* add a unique identifier for an individual invocation so you can grep the
* request identifier and see the log messags in isolation.
*
* This is a macro because in a real-example you might want to make this
* a no-op for non-debug builds to avoid the cost of the call.
*/
#define REQUEST_ID() bpf_get_prandom_u32()
#define DEBUG(x, ...) bpf_debug_printk(x, ##__VA_ARGS__)
#define DEBUG_DECAP(id, x, ...) DEBUG("[decap][%u]" x, id, ##__VA_ARGS__)
#define DEBUG_ENCAP(id, x, ...) DEBUG("[encap][%u]" x, id, ##__VA_ARGS__)
/*
* Entry point for the encapsulation & decapsulation eBPF
* __sk_buff is a "shadow" struct of the internal sk_buff.
* You can read more about how sk_buff works at:
* http://vger.kernel.org/~davem/skb_data.html
* @skb the socket buffer struct
*/
int mpls_decap_filter(struct __sk_buff *skb);
int mpls_encap_filter(struct __sk_buff *skb);
SEC("mpls_decap") int mpls_decap_filter(struct __sk_buff *skb) {
// Generate a unique request id so we can identify each flow in
// the trace logs
unsigned long long request_id = REQUEST_ID();
DEBUG_DECAP(request_id, "starting mpls decap.\n");
/*
* the redundant casts are needed according to the documentation.
* possibly for the BPF verifier.
* https://www.spinics.net/lists/xdp-newbies/msg00181.html
*/
void *data_end = (void *)(long)skb->data_end;
void *data = (void *)(long)skb->data;
// The packet starts with the ethernet header, so let's get that going:
struct ethhdr *eth = (struct ethhdr *)(data);
/*
* Now, we can't just go "eth->h_proto", that's illegal. We have to
* explicitly test that such an access is in range and doesn't go
* beyond "data_end" -- again for the verifier.
* The eBPF verifier will see that "eth" holds a packet pointer,
* and also that you have made sure that from "eth" to "eth + 1"
* is inside the valid access range for the packet.
*/
if ((void *)(eth + 1) > data_end) {
DEBUG_DECAP(request_id, "socket buffer struct was malformed.\n");
return TC_ACT_SHOT;
}
/*
* We only care about IP packet frames. Don't do anything to other ethernet
* packets like ARP.
* hton -> host to network order. Network order is always big-endian.
* pedantic: the protocol is also directly accessible from __sk_buf
*/
if (eth->h_proto != bpf_htons(ETH_P_IP)) {
DEBUG_DECAP(request_id, " ethernet is not wrapping IP packet: 0x%x\n",
bpf_ntohs(eth->h_proto));
return TC_ACT_OK;
}
struct iphdr *iph = (struct iphdr *)(void *)(eth + 1);
if ((void *)(iph + 1) > data_end) {
DEBUG_DECAP(request_id, "socket buffer struct was malformed.\n");
return TC_ACT_SHOT;
}
// multiply ip header by 4 (bytes) to get the number of bytes of the header.
int iph_len = iph->ihl << 2;
if (iph_len > MAX_IP_HDR_LEN) {
DEBUG_DECAP(request_id, "ip header is too long: %d.\n", iph_len);
return TC_ACT_SHOT;
}
// https://tools.ietf.org/html/rfc4023
//==[ETH HDR][IP HDR][MPLS HDR][Data]==
struct mpls_hdr *mpls = (struct mpls_hdr *)((void *)(iph) + iph_len);
if ((void *)(mpls + 1) > data_end) {
DEBUG_DECAP(request_id, "socket buffer struct was malformed.\n");
return TC_ACT_SHOT;
}
struct mpls_entry_decoded mpls_decoded = mpls_entry_decode(mpls);
DEBUG_DECAP(request_id, "decoded MPLS label: 0x%x\n", mpls_decoded.label);
if (!is_mpls_entry_bos(mpls)) {
DEBUG_DECAP(request_id, "mpls label not bottom of stack.\n");
return TC_ACT_SHOT;
}
/*
* This is the amount of padding we need to remove to be just left
* with eth * iphdr.
*/
int padlen = sizeof(struct mpls_hdr);
/*
* Grow or shrink the room for data in the packet associated to
* skb by length and according to the selected mode.
* BPF_ADJ_ROOM_NET: Adjust room at the network layer
* (room space is added or removed below the layer 3 header).
*/
int ret = bpf_skb_adjust_room(skb, -padlen, BPF_ADJ_ROOM_NET, 0);
if (ret) {
DEBUG_DECAP(request_id, "error calling skb adjust room.\n");
return TC_ACT_SHOT;
}
DEBUG_DECAP(request_id, "finished mpls decap.\n");
return TC_ACT_OK;
}
SEC("mpls_encap") int mpls_encap_filter(struct __sk_buff *skb) {
// Generate a unique request id so we can identify each flow in
// the trace logs
unsigned long long request_id = REQUEST_ID();
DEBUG_ENCAP(request_id, "starting mpls encap.\n");
/*
* the redundant casts are needed according to the documentation.
* possibly for the BPF verifier.
* https://www.spinics.net/lists/xdp-newbies/msg00181.html
*/
void *data_end = (void *)(long)skb->data_end;
void *data = (void *)(long)skb->data;
// The packet starts with the ethernet header, so let's get that going:
struct ethhdr *eth = (struct ethhdr *)(data);
/*
* Now, we can't just go "eth->h_proto", that's illegal. We have to
* explicitly test that such an access is in range and doesn't go
* beyond "data_end" -- again for the verifier.
* The eBPF verifier will see that "eth" holds a packet pointer,
* and also that you have made sure that from "eth" to "eth + 1"
* is inside the valid access range for the packet.
*/
if ((void *)(eth + 1) > data_end) {
DEBUG_ENCAP(request_id, "socket buffer struct was malformed.\n");
return TC_ACT_SHOT;
}
DEBUG_ENCAP(request_id, "casted to eth header.\n");
/*
* We only care about IP packet frames. Don't do anything to other ethernet
* packets like ARP.
* hton -> host to network order. Network order is always big-endian.
* pedantic: the protocol is also directly accessible from __sk_buf
*/
if (eth->h_proto != bpf_htons(ETH_P_IP)) {
DEBUG_ENCAP(request_id, "ethernet is not wrapping IP packet: 0x%x\n",
bpf_ntohs(eth->h_proto));
return TC_ACT_OK;
}
struct iphdr *iph = (struct iphdr *)(void *)(eth + 1);
if ((void *)(iph + 1) > data_end) {
DEBUG_ENCAP(request_id, "socket buffer struct was malformed.\n");
return TC_ACT_SHOT;
}
DEBUG_ENCAP(request_id, "casted to ip header.\n");
// multiply ip header by 4 (bytes) to get the number of bytes of the header.
int iph_len = iph->ihl << 2;
if (iph_len > MAX_IP_HDR_LEN) {
DEBUG_ENCAP(request_id, "ip header is too long: %d\n", iph_len);
return TC_ACT_SHOT;
}
DEBUG_ENCAP(request_id, "calculated ip header length.\n");
/*
* This is the amount of padding we need to remove to be just left
* with eth * iphdr.
*/
int padlen = sizeof(struct mpls_hdr);
/*
* Grow or shrink the room for data in the packet associated to
* skb by length and according to the selected mode.
* BPF_ADJ_ROOM_NET: Adjust room at the network layer
* (room space is added or removed below the layer 3 header).
*/
int ret = bpf_skb_adjust_room(skb, padlen, BPF_ADJ_ROOM_NET, 0);
if (ret) {
DEBUG_ENCAP(request_id, "error calling skb adjust room.\n");
return TC_ACT_SHOT;
}
DEBUG_ENCAP(request_id, "about to store bytes of MPLS label: 0x%x\n",
MPLS_STATIC_LABEL);
// construct our deterministic mpls header
struct mpls_hdr mpls = mpls_encode(MPLS_STATIC_LABEL, 123, 0, true);
unsigned long offset = sizeof(struct ethhdr) + (unsigned long)iph_len;
ret = bpf_skb_store_bytes(skb, (int)offset, &mpls, sizeof(struct mpls_hdr),
BPF_F_RECOMPUTE_CSUM);
DEBUG_ENCAP(request_id, "finished mpls encap.\n");
return TC_ACT_OK;
}
static char _license[] SEC("license") = "GPL";