4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2012, 2015, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Seagate, Inc.
32 * lnet/include/lnet/lib-types.h
35 #ifndef __LNET_LIB_TYPES_H__
36 #define __LNET_LIB_TYPES_H__
38 #include <linux/kthread.h>
39 #include <linux/uio.h>
40 #include <linux/types.h>
45 /* Max payload size */
46 #define LNET_MAX_PAYLOAD CONFIG_LNET_MAX_PAYLOAD
47 #if (LNET_MAX_PAYLOAD < LNET_MTU)
48 # error "LNET_MAX_PAYLOAD too small - error in configure --with-max-payload-mb"
49 #elif (LNET_MAX_PAYLOAD > (PAGE_SIZE * LNET_MAX_IOV))
50 # error "LNET_MAX_PAYLOAD too large - error in configure --with-max-payload-mb"
56 typedef struct lnet_msg
{
57 struct list_head msg_activelist
;
58 struct list_head msg_list
; /* Q for credits/MD */
60 lnet_process_id_t msg_target
;
61 /* where is it from, it's only for building event */
65 /* committed for sending */
66 unsigned int msg_tx_committed
:1;
67 /* CPT # this message committed for sending */
68 unsigned int msg_tx_cpt
:15;
69 /* committed for receiving */
70 unsigned int msg_rx_committed
:1;
71 /* CPT # this message committed for receiving */
72 unsigned int msg_rx_cpt
:15;
73 /* queued for tx credit */
74 unsigned int msg_tx_delayed
:1;
75 /* queued for RX buffer */
76 unsigned int msg_rx_delayed
:1;
77 /* ready for pending on RX delay list */
78 unsigned int msg_rx_ready_delay
:1;
80 unsigned int msg_vmflush
:1; /* VM trying to free memory */
81 unsigned int msg_target_is_router
:1; /* sending to a router */
82 unsigned int msg_routing
:1; /* being forwarded */
83 unsigned int msg_ack
:1; /* ack on finalize (PUT) */
84 unsigned int msg_sending
:1; /* outgoing message */
85 unsigned int msg_receiving
:1; /* being received */
86 unsigned int msg_txcredit
:1; /* taken an NI send credit */
87 unsigned int msg_peertxcredit
:1; /* taken a peer send credit */
88 unsigned int msg_rtrcredit
:1; /* taken a global router credit */
89 unsigned int msg_peerrtrcredit
:1; /* taken a peer router credit */
90 unsigned int msg_onactivelist
:1; /* on the activelist */
92 struct lnet_peer
*msg_txpeer
; /* peer I'm sending to */
93 struct lnet_peer
*msg_rxpeer
; /* peer I received from */
96 struct lnet_libmd
*msg_md
;
99 unsigned int msg_wanted
;
100 unsigned int msg_offset
;
101 unsigned int msg_niov
;
102 struct kvec
*msg_iov
;
103 lnet_kiov_t
*msg_kiov
;
109 typedef struct lnet_libhandle
{
110 struct list_head lh_hash_chain
;
/*
 * lh_entry - map a pointer to an embedded member back to its container.
 *
 * @ptr:    address of the embedded member
 * @type:   type of the enclosing structure
 * @member: name of the member inside @type that @ptr points at
 *
 * Evaluates to a 'type *' addressing the enclosing object.  The member's
 * byte offset comes from offsetof() instead of taking a member address
 * through a null pointer and subtracting two unrelated pointers, both of
 * which are undefined in standard C.
 */
#define lh_entry(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))
117 typedef struct lnet_eq
{
118 struct list_head eq_list
;
119 lnet_libhandle_t eq_lh
;
120 lnet_seq_t eq_enq_seq
;
121 lnet_seq_t eq_deq_seq
;
122 unsigned int eq_size
;
123 lnet_eq_handler_t eq_callback
;
124 lnet_event_t
*eq_events
;
125 int **eq_refs
; /* percpt refcount for EQ */
128 typedef struct lnet_me
{
129 struct list_head me_list
;
130 lnet_libhandle_t me_lh
;
131 lnet_process_id_t me_match_id
;
132 unsigned int me_portal
;
133 unsigned int me_pos
; /* hash offset in mt_hash */
135 __u64 me_ignore_bits
;
136 lnet_unlink_t me_unlink
;
137 struct lnet_libmd
*me_md
;
140 typedef struct lnet_libmd
{
141 struct list_head md_list
;
142 lnet_libhandle_t md_lh
;
145 unsigned int md_offset
;
146 unsigned int md_length
;
147 unsigned int md_max_size
;
150 unsigned int md_options
;
151 unsigned int md_flags
;
154 unsigned int md_niov
; /* # frags */
156 struct kvec iov
[LNET_MAX_IOV
];
157 lnet_kiov_t kiov
[LNET_MAX_IOV
];
/*
 * MD flag bits; each one is a distinct single bit.
 * NOTE(review): presumably stored in lnet_libmd::md_flags - confirm.
 */
#define LNET_MD_FLAG_ZOMBIE		0x1
#define LNET_MD_FLAG_AUTO_UNLINK	0x2
#define LNET_MD_FLAG_ABORTED		0x4
166 /* info about peers we are trying to fail */
167 struct list_head tp_list
; /* ln_test_peers */
168 lnet_nid_t tp_nid
; /* matching nid */
169 unsigned int tp_threshold
; /* # failures to simulate */
/*
 * Cookie type tags.  Every tag fits in LNET_COOKIE_TYPE_BITS bits and
 * LNET_COOKIE_MASK selects exactly those low-order bits.
 */
#define LNET_COOKIE_TYPE_MD	1
#define LNET_COOKIE_TYPE_ME	2
#define LNET_COOKIE_TYPE_EQ	3
#define LNET_COOKIE_TYPE_BITS	2
#define LNET_COOKIE_MASK	(~(~0ULL << LNET_COOKIE_TYPE_BITS))
178 struct lnet_ni
; /* forward ref */
180 typedef struct lnet_lnd
{
181 /* fields managed by portals */
182 struct list_head lnd_list
; /* stash in the LND table */
183 int lnd_refcount
; /* # active instances */
185 /* fields initialised by the LND */
188 int (*lnd_startup
)(struct lnet_ni
*ni
);
189 void (*lnd_shutdown
)(struct lnet_ni
*ni
);
190 int (*lnd_ctl
)(struct lnet_ni
*ni
, unsigned int cmd
, void *arg
);
193 * In data movement APIs below, payload buffers are described as a set
194 * of 'niov' fragments which are...
196 * in virtual memory (struct kvec *iov != NULL)
198 * in pages (kernel only: lnet_kiov_t *kiov != NULL).
199 * The LND may NOT overwrite these fragment descriptors.
200 * An 'offset' may specify a byte offset within the set of
201 * fragments to start from
205 * Start sending a preformatted message. 'private' is NULL for PUT and
206 * GET messages; otherwise this is a response to an incoming message
207 * and 'private' is the 'private' passed to lnet_parse(). Return
208 * non-zero for immediate failure, otherwise complete later with
211 int (*lnd_send
)(struct lnet_ni
*ni
, void *private, lnet_msg_t
*msg
);
214 * Start receiving 'mlen' bytes of payload data, skipping the following
215 * 'rlen' - 'mlen' bytes. 'private' is the 'private' passed to
216 * lnet_parse(). Return non-zero for immediate failure, otherwise
217 * complete later with lnet_finalize(). This also gives back a receive
218 * credit if the LND does flow control.
220 int (*lnd_recv
)(struct lnet_ni
*ni
, void *private, lnet_msg_t
*msg
,
221 int delayed
, unsigned int niov
,
222 struct kvec
*iov
, lnet_kiov_t
*kiov
,
223 unsigned int offset
, unsigned int mlen
,
227 * lnet_parse() has had to delay processing of this message
228 * (e.g. waiting for a forwarding buffer or send credits). Give the
229 * LND a chance to free urgently needed resources. If called, return 0
230 * for success and do NOT give back a receive credit; that has to wait
231 * until lnd_recv() gets called. On failure return < 0 and
232 * release resources; lnd_recv() will not be called.
234 int (*lnd_eager_recv
)(struct lnet_ni
*ni
, void *private,
235 lnet_msg_t
*msg
, void **new_privatep
);
237 /* notification of peer health */
238 void (*lnd_notify
)(struct lnet_ni
*ni
, lnet_nid_t peer
, int alive
);
240 /* query of peer aliveness */
241 void (*lnd_query
)(struct lnet_ni
*ni
, lnet_nid_t peer
,
242 unsigned long *when
);
244 /* accept a new connection */
245 int (*lnd_accept
)(struct lnet_ni
*ni
, struct socket
*sock
);
248 struct lnet_tx_queue
{
249 int tq_credits
; /* # tx credits free */
250 int tq_credits_min
; /* lowest it's been */
251 int tq_credits_max
; /* total # tx credits */
252 struct list_head tq_delayed
; /* delayed TXs */
255 typedef struct lnet_ni
{
257 struct list_head ni_list
; /* chain on ln_nis */
258 struct list_head ni_cptlist
; /* chain on ln_nis_cpt */
259 int ni_maxtxcredits
; /* # tx credits */
260 /* # per-peer send credits */
261 int ni_peertxcredits
;
262 /* # per-peer router buffer credits */
263 int ni_peerrtrcredits
;
264 /* seconds to consider peer dead */
266 int ni_ncpts
; /* number of CPTs */
267 __u32
*ni_cpts
; /* bond NI on some CPTs */
268 lnet_nid_t ni_nid
; /* interface's NID */
269 void *ni_data
; /* instance-specific data */
270 lnd_t
*ni_lnd
; /* procedural interface */
271 struct lnet_tx_queue
**ni_tx_queues
; /* percpt TX queues */
272 int **ni_refs
; /* percpt reference count */
273 time64_t ni_last_alive
;/* when I was last alive */
274 lnet_ni_status_t
*ni_status
; /* my health status */
275 /* equivalent interfaces to use */
276 char *ni_interfaces
[LNET_MAX_INTERFACES
];
/* NOTE(review): presumably the match bits reserved for LNet pings - confirm */
#define LNET_PROTO_PING_MATCHBITS	0x8000000000000000LL

/*
 * NB: the values of these features are equal to LNET_PROTO_PING_VERSION_x
 * of old LNet, so there shouldn't be any compatibility issue
 */
#define LNET_PING_FEAT_INVAL		(0)		/* no feature */
#define LNET_PING_FEAT_BASE		(1 << 0)	/* just a ping */
#define LNET_PING_FEAT_NI_STATUS	(1 << 1)	/* return NI status */
/*
 * Going by the flag name this bit reports that routing is DISABLED; the
 * comment here used to read "Routing enabled".
 * NOTE(review): confirm against the code that sets this bit.
 */
#define LNET_PING_FEAT_RTE_DISABLED	(1 << 2)	/* routing disabled */

/* LNET_PING_FEAT_RTE_DISABLED is not included in this mask */
#define LNET_PING_FEAT_MASK		(LNET_PING_FEAT_BASE | \
					 LNET_PING_FEAT_NI_STATUS)
/* router checker data, per router */
#define LNET_MAX_RTR_NIS	16
/* byte offset of element LNET_MAX_RTR_NIS of lnet_ping_info_t::pi_ni */
#define LNET_PINGINFO_SIZE \
	offsetof(lnet_ping_info_t, pi_ni[LNET_MAX_RTR_NIS])
297 /* chain on the_lnet.ln_zombie_rcd or ln_deathrow_rcd */
298 struct list_head rcd_list
;
299 lnet_handle_md_t rcd_mdh
; /* ping buffer MD */
300 struct lnet_peer
*rcd_gateway
; /* reference to gateway */
301 lnet_ping_info_t
*rcd_pinginfo
; /* ping buffer */
304 typedef struct lnet_peer
{
305 struct list_head lp_hashlist
; /* chain on peer hash */
306 struct list_head lp_txq
; /* messages blocking for
308 struct list_head lp_rtrq
; /* messages blocking for
310 struct list_head lp_rtr_list
; /* chain on router list */
311 int lp_txcredits
; /* # tx credits available */
312 int lp_mintxcredits
; /* low water mark */
313 int lp_rtrcredits
; /* # router credits */
314 int lp_minrtrcredits
; /* low water mark */
315 unsigned int lp_alive
:1; /* alive/dead? */
316 unsigned int lp_notify
:1; /* notification outstanding? */
317 unsigned int lp_notifylnd
:1;/* outstanding notification
319 unsigned int lp_notifying
:1; /* some thread is handling
321 unsigned int lp_ping_notsent
;/* SEND event outstanding
323 int lp_alive_count
; /* # times router went
325 long lp_txqnob
; /* bytes queued for sending */
326 unsigned long lp_timestamp
; /* time of last aliveness
328 unsigned long lp_ping_timestamp
;/* time of last ping
330 unsigned long lp_ping_deadline
; /* != 0 if ping reply
332 unsigned long lp_last_alive
; /* when I was last alive */
333 unsigned long lp_last_query
; /* when lp_ni was queried
335 lnet_ni_t
*lp_ni
; /* interface peer is on */
336 lnet_nid_t lp_nid
; /* peer's NID */
337 int lp_refcount
; /* # refs */
338 int lp_cpt
; /* CPT this peer attached on */
339 /* # refs from lnet_route_t::lr_gateway */
341 /* returned RC ping features */
342 unsigned int lp_ping_feats
;
343 struct list_head lp_routes
; /* routers on this peer */
344 lnet_rc_data_t
*lp_rcd
; /* router checker state */
/* LNET_PEER_HASH_SIZE is 2^LNET_PEER_HASH_BITS */
#define LNET_PEER_HASH_BITS	9
#define LNET_PEER_HASH_SIZE	(1 << (LNET_PEER_HASH_BITS))
351 /* peer hash table */
352 struct lnet_peer_table
{
353 int pt_version
; /* /proc validity stamp */
354 int pt_number
; /* # peers extant */
355 /* # zombies to go to deathrow (and not there yet) */
357 struct list_head pt_deathrow
; /* zombie peers */
358 struct list_head
*pt_hash
; /* NID->peer hash */
/*
 * peer aliveness is enabled only on routers for peers in a network where the
 * lnet_ni_t::ni_peertimeout has been set to a positive value
 *
 * 'lp' is expanded exactly once and is only dereferenced when
 * the_lnet.ln_routing is non-zero.
 */
#define lnet_peer_aliveness_enabled(lp)			\
	((the_lnet.ln_routing) &&			\
	 ((lp)->lp_ni->ni_peertimeout > 0))
369 struct list_head lr_list
; /* chain on net */
370 struct list_head lr_gwlist
; /* chain on gateway */
371 lnet_peer_t
*lr_gateway
; /* router node */
372 __u32 lr_net
; /* remote network number */
373 int lr_seq
; /* sequence for round-robin */
374 unsigned int lr_downis
; /* number of down NIs */
375 unsigned int lr_hops
; /* how far I am */
376 unsigned int lr_priority
; /* route priority */
/*
 * Remote-nets hash sizing: a default and an upper bound, plus the size
 * derived at the point of use from the_lnet.ln_remote_nets_hbits.
 */
#define LNET_REMOTE_NETS_HASH_DEFAULT	(1U << 7)
#define LNET_REMOTE_NETS_HASH_MAX	(1U << 16)
#define LNET_REMOTE_NETS_HASH_SIZE	\
	(1 << the_lnet.ln_remote_nets_hbits)
384 struct list_head lrn_list
; /* chain on
385 ln_remote_nets_hash */
386 struct list_head lrn_routes
; /* routes to me */
387 __u32 lrn_net
; /* my net number */
/** the lnet message holds a credit and can be submitted to the lnd for send/receive */
#define LNET_CREDIT_OK		0
/** the lnet message is waiting for a credit */
#define LNET_CREDIT_WAIT	1
396 struct list_head rbp_bufs
; /* my free buffer pool */
397 struct list_head rbp_msgs
; /* messages blocking
399 int rbp_npages
; /* # pages in each buffer */
400 int rbp_nbuffers
; /* # buffers */
401 int rbp_credits
; /* # free buffers /
403 int rbp_mincredits
; /* low water mark */
407 struct list_head rb_list
; /* chain on rbp_bufs */
408 lnet_rtrbufpool_t
*rb_pool
; /* owning pool */
409 lnet_kiov_t rb_kiov
[0]; /* the buffer space */
#define LNET_PEER_HASHSIZE	503	/* prime! */

/* buffer indices; LNET_LARGE_BUF_IDX is the highest */
#define LNET_TINY_BUF_IDX	0
#define LNET_SMALL_BUF_IDX	1
#define LNET_LARGE_BUF_IDX	2

/* # different router buffer pools */
#define LNET_NRBPOOLS		(1 + LNET_LARGE_BUF_IDX)
422 /* Didn't match anything */
423 LNET_MATCHMD_NONE
= (1 << 0),
425 LNET_MATCHMD_OK
= (1 << 1),
426 /* Must be discarded */
427 LNET_MATCHMD_DROP
= (1 << 2),
428 /* match and buffer is exhausted */
429 LNET_MATCHMD_EXHAUSTED
= (1 << 3),
431 LNET_MATCHMD_FINISH
= (LNET_MATCHMD_OK
| LNET_MATCHMD_DROP
),
/* Options for lnet_portal_t::ptl_options */
#define LNET_PTL_LAZY			0x1
/* unique match, for RDMA */
#define LNET_PTL_MATCH_UNIQUE		0x2
/* wildcard match */
#define LNET_PTL_MATCH_WILDCARD		0x4
440 /* parameter for matching operations (GET, PUT) */
441 struct lnet_match_info
{
443 lnet_process_id_t mi_id
;
445 unsigned int mi_portal
;
446 unsigned int mi_rlength
;
447 unsigned int mi_roffset
;
/* ME hash of RDMA portal */
#define LNET_MT_HASH_BITS	8
#define LNET_MT_HASH_SIZE	(1 << (LNET_MT_HASH_BITS))
#define LNET_MT_HASH_MASK	((LNET_MT_HASH_SIZE) - 1)

/*
 * (LNET_MT_HASH_SIZE + 1) entries are allocated for
 * lnet_match_table::mt_mhash; the last entry is reserved for MEs with
 * ignore-bits.
 */
#define LNET_MT_HASH_IGNORE	LNET_MT_HASH_SIZE

/*
 * A __u64 has 2^6 bits, so 2^(LNET_MT_HASH_BITS - LNET_MT_BITS_U64), i.e.
 * 4, __u64s are needed as the bit-map, plus one extra __u64 (of which only
 * one bit is used) for the ME-list with ignore-bits, which is
 * mt_mhash[LNET_MT_HASH_IGNORE].
 */
#define LNET_MT_BITS_U64	6	/* 2^6 bits */
#define LNET_MT_EXHAUSTED_BITS	((LNET_MT_HASH_BITS) - (LNET_MT_BITS_U64))
#define LNET_MT_EXHAUSTED_BMAP	(1 + (1 << LNET_MT_EXHAUSTED_BITS))
468 /* portal match table */
469 struct lnet_match_table
{
470 /* reserved for upcoming patches, CPU partition ID */
472 unsigned int mt_portal
; /* portal index */
474 * the match table is set as "enabled" if there is a non-exhausted MD
475 * attached on mt_mhash; this is only valid for wildcard portals
477 unsigned int mt_enabled
;
478 /* bitmap to flag whether MEs on mt_mhash are exhausted or not */
479 __u64 mt_exhausted
[LNET_MT_EXHAUSTED_BMAP
];
480 struct list_head
*mt_mhash
; /* matching hash */
/*
 * Message rotor settings; these are only useful for wildcard portals.
 */

/* turn off the message rotor for wildcard portals */
#define LNET_PTL_ROTOR_OFF	0
/* round-robin dispatch of all PUT messages for wildcard portals */
#define LNET_PTL_ROTOR_ON	1
/* round-robin dispatch of routed PUT messages for wildcard portals */
#define LNET_PTL_ROTOR_RR_RT	2
/* dispatch routed PUT messages by hashing the source NID, for wildcard portals */
#define LNET_PTL_ROTOR_HASH_RT	3
493 typedef struct lnet_portal
{
495 unsigned int ptl_index
; /* portal ID, reserved */
496 /* flags on this portal: lazy, unique... */
497 unsigned int ptl_options
;
498 /* list of messages which are stealing buffer */
499 struct list_head ptl_msg_stealing
;
500 /* messages blocking for MD */
501 struct list_head ptl_msg_delayed
;
502 /* Match table for each CPT */
503 struct lnet_match_table
**ptl_mtables
;
504 /* spread rotor of incoming "PUT" */
505 unsigned int ptl_rotor
;
506 /* # active entries for this portal */
508 /* array of active entries' cpu-partition-id */
/*
 * LNET_LH_HASH_SIZE is 2^LNET_LH_HASH_BITS as an unsigned long long, and
 * LNET_LH_HASH_MASK selects the low LNET_LH_HASH_BITS bits.
 */
#define LNET_LH_HASH_BITS	12
#define LNET_LH_HASH_SIZE	(1ULL << (LNET_LH_HASH_BITS))
#define LNET_LH_HASH_MASK	((LNET_LH_HASH_SIZE) - 1)
516 /* resource container (ME, MD, EQ) */
517 struct lnet_res_container
{
518 unsigned int rec_type
; /* container type */
519 __u64 rec_lh_cookie
; /* cookie generator */
520 struct list_head rec_active
; /* active resource list */
521 struct list_head
*rec_lh_hash
; /* handle hash */
524 /* message container */
525 struct lnet_msg_container
{
526 int msc_init
; /* initialized or not */
527 /* max # threads finalizing */
529 /* msgs waiting to complete finalizing */
530 struct list_head msc_finalizing
;
531 struct list_head msc_active
; /* active message list */
532 /* threads doing finalization */
533 void **msc_finalizers
;
/* Router Checker states */
/* not started */
#define LNET_RC_STATE_SHUTDOWN	0
/* started up OK */
#define LNET_RC_STATE_RUNNING	1
/* telling thread to stop */
#define LNET_RC_STATE_STOPPING	2
542 /* CPU partition table of LNet */
543 struct cfs_cpt_table
*ln_cpt_table
;
544 /* number of CPTs in ln_cpt_table */
545 unsigned int ln_cpt_number
;
546 unsigned int ln_cpt_bits
;
548 /* protect LNet resources (ME/MD/EQ) */
549 struct cfs_percpt_lock
*ln_res_lock
;
552 /* the vector of portals */
553 lnet_portal_t
**ln_portals
;
554 /* percpt ME containers */
555 struct lnet_res_container
**ln_me_containers
;
556 /* percpt MD container */
557 struct lnet_res_container
**ln_md_containers
;
559 /* Event Queue container */
560 struct lnet_res_container ln_eq_container
;
561 wait_queue_head_t ln_eq_waitq
;
562 spinlock_t ln_eq_wait_lock
;
563 unsigned int ln_remote_nets_hbits
;
565 /* protect NI, peer table, credits, routers, rtrbuf... */
566 struct cfs_percpt_lock
*ln_net_lock
;
567 /* percpt message containers for active/finalizing/freed message */
568 struct lnet_msg_container
**ln_msg_containers
;
569 lnet_counters_t
**ln_counters
;
570 struct lnet_peer_table
**ln_peer_tables
;
571 /* failure simulation */
572 struct list_head ln_test_peers
;
574 struct list_head ln_nis
; /* LND instances */
575 /* NIs bond on specific CPT(s) */
576 struct list_head ln_nis_cpt
;
577 /* dying LND instances */
578 struct list_head ln_nis_zombie
;
579 lnet_ni_t
*ln_loni
; /* the loopback NI */
580 /* NI to wait for events in */
581 lnet_ni_t
*ln_eq_waitni
;
583 /* remote networks with routes to them */
584 struct list_head
*ln_remote_nets_hash
;
586 __u64 ln_remote_nets_version
;
587 /* list of all known routers */
588 struct list_head ln_routers
;
590 __u64 ln_routers_version
;
591 /* percpt router buffer pools */
592 lnet_rtrbufpool_t
**ln_rtrpools
;
594 lnet_handle_md_t ln_ping_target_md
;
595 lnet_handle_eq_t ln_ping_target_eq
;
596 lnet_ping_info_t
*ln_ping_info
;
598 /* router checker startup/shutdown state */
600 /* router checker's event queue */
601 lnet_handle_eq_t ln_rc_eqh
;
602 /* rcd still pending on net */
603 struct list_head ln_rcd_deathrow
;
604 /* rcd ready for free */
605 struct list_head ln_rcd_zombie
;
606 /* serialise startup/shutdown */
607 struct semaphore ln_rc_signal
;
609 struct mutex ln_api_mutex
;
610 struct mutex ln_lnd_mutex
;
611 int ln_init
; /* lnet_init()
613 /* Have I called LNetNIInit myself? */
615 /* LNetNIInit/LNetNIFini counter */
617 /* shutdown in progress */
620 int ln_routing
; /* am I a router? */
621 lnet_pid_t ln_pid
; /* requested pid */
622 /* uniquely identifies this ni in this epoch */
623 __u64 ln_interface_cookie
;
624 /* registered LNDs */
625 struct list_head ln_lnds
;
627 /* test protocol compatibility flags */
628 int ln_testprotocompat
;
631 * 0 - load the NIs from the mod params
632 * 1 - do not load the NIs from the mod params
633 * Reverse logic to ensure that other calls to LNetNIInit
636 bool ln_nis_from_mod_params
;