--- /dev/null
+# SPDX-License-Identifier: GPL-2.0
+#
+# clang-format configuration file. Intended for clang-format >= 4.
+#
+# For more information, see:
+#
+# Documentation/process/clang-format.rst
+# https://clang.llvm.org/docs/ClangFormat.html
+# https://clang.llvm.org/docs/ClangFormatStyleOptions.html
+#
+---
+AccessModifierOffset: -4
+AlignAfterOpenBracket: Align
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+#AlignEscapedNewlines: Left # Unknown to clang-format-4.0
+AlignOperands: true
+AlignTrailingComments: false
+AllowAllParametersOfDeclarationOnNextLine: false
+AllowShortBlocksOnASingleLine: false
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: None
+AllowShortIfStatementsOnASingleLine: false
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: false
+AlwaysBreakTemplateDeclarations: false
+BinPackArguments: true
+BinPackParameters: true
+BraceWrapping:
+ AfterClass: false
+ AfterControlStatement: false
+ AfterEnum: false
+ AfterFunction: true
+ AfterNamespace: true
+ AfterObjCDeclaration: false
+ AfterStruct: false
+ AfterUnion: false
+ #AfterExternBlock: false # Unknown to clang-format-5.0
+ BeforeCatch: false
+ BeforeElse: false
+ IndentBraces: false
+ #SplitEmptyFunction: true # Unknown to clang-format-4.0
+ #SplitEmptyRecord: true # Unknown to clang-format-4.0
+ #SplitEmptyNamespace: true # Unknown to clang-format-4.0
+BreakBeforeBinaryOperators: None
+BreakBeforeBraces: Custom
+#BreakBeforeInheritanceComma: false # Unknown to clang-format-4.0
+BreakBeforeTernaryOperators: false
+BreakConstructorInitializersBeforeComma: false
+#BreakConstructorInitializers: BeforeComma # Unknown to clang-format-4.0
+BreakAfterJavaFieldAnnotations: false
+BreakStringLiterals: false
+ColumnLimit: 80
+CommentPragmas: '^ IWYU pragma:'
+#CompactNamespaces: false # Unknown to clang-format-4.0
+ConstructorInitializerAllOnOneLineOrOnePerLine: false
+ConstructorInitializerIndentWidth: 8
+ContinuationIndentWidth: 8
+Cpp11BracedListStyle: false
+DerivePointerAlignment: false
+DisableFormat: false
+ExperimentalAutoDetectBinPacking: false
+#FixNamespaceComments: false # Unknown to clang-format-4.0
+
+# Taken from:
+# git grep -h '^#define [^[:space:]]*for_each[^[:space:]]*(' include/ \
+# | sed "s,^#define \([^[:space:]]*for_each[^[:space:]]*\)(.*$, - '\1'," \
+# | sort | uniq
+ForEachMacros:
+ - 'apei_estatus_for_each_section'
+ - 'ata_for_each_dev'
+ - 'ata_for_each_link'
+ - 'ax25_for_each'
+ - 'ax25_uid_for_each'
+ - 'bio_for_each_integrity_vec'
+ - '__bio_for_each_segment'
+ - 'bio_for_each_segment'
+ - 'bio_for_each_segment_all'
+ - 'bio_list_for_each'
+ - 'bip_for_each_vec'
+ - 'blkg_for_each_descendant_post'
+ - 'blkg_for_each_descendant_pre'
+ - 'blk_queue_for_each_rl'
+ - 'bond_for_each_slave'
+ - 'bond_for_each_slave_rcu'
+ - 'btree_for_each_safe128'
+ - 'btree_for_each_safe32'
+ - 'btree_for_each_safe64'
+ - 'btree_for_each_safel'
+ - 'card_for_each_dev'
+ - 'cgroup_taskset_for_each'
+ - 'cgroup_taskset_for_each_leader'
+ - 'cpufreq_for_each_entry'
+ - 'cpufreq_for_each_entry_idx'
+ - 'cpufreq_for_each_valid_entry'
+ - 'cpufreq_for_each_valid_entry_idx'
+ - 'css_for_each_child'
+ - 'css_for_each_descendant_post'
+ - 'css_for_each_descendant_pre'
+ - 'device_for_each_child_node'
+ - 'drm_atomic_crtc_for_each_plane'
+ - 'drm_atomic_crtc_state_for_each_plane'
+ - 'drm_atomic_crtc_state_for_each_plane_state'
+ - 'drm_for_each_connector_iter'
+ - 'drm_for_each_crtc'
+ - 'drm_for_each_encoder'
+ - 'drm_for_each_encoder_mask'
+ - 'drm_for_each_fb'
+ - 'drm_for_each_legacy_plane'
+ - 'drm_for_each_plane'
+ - 'drm_for_each_plane_mask'
+ - 'drm_mm_for_each_hole'
+ - 'drm_mm_for_each_node'
+ - 'drm_mm_for_each_node_in_range'
+ - 'drm_mm_for_each_node_safe'
+ - 'for_each_active_drhd_unit'
+ - 'for_each_active_iommu'
+ - 'for_each_available_child_of_node'
+ - 'for_each_bio'
+ - 'for_each_board_func_rsrc'
+ - 'for_each_bvec'
+ - 'for_each_child_of_node'
+ - 'for_each_clear_bit'
+ - 'for_each_clear_bit_from'
+ - 'for_each_cmsghdr'
+ - 'for_each_compatible_node'
+ - 'for_each_console'
+ - 'for_each_cpu'
+ - 'for_each_cpu_and'
+ - 'for_each_cpu_not'
+ - 'for_each_cpu_wrap'
+ - 'for_each_dev_addr'
+ - 'for_each_dma_cap_mask'
+ - 'for_each_drhd_unit'
+ - 'for_each_dss_dev'
+ - 'for_each_efi_memory_desc'
+ - 'for_each_efi_memory_desc_in_map'
+ - 'for_each_endpoint_of_node'
+ - 'for_each_evictable_lru'
+ - 'for_each_fib6_node_rt_rcu'
+ - 'for_each_fib6_walker_rt'
+ - 'for_each_free_mem_range'
+ - 'for_each_free_mem_range_reverse'
+ - 'for_each_func_rsrc'
+ - 'for_each_hstate'
+ - 'for_each_if'
+ - 'for_each_iommu'
+ - 'for_each_ip_tunnel_rcu'
+ - 'for_each_irq_nr'
+ - 'for_each_lru'
+ - 'for_each_matching_node'
+ - 'for_each_matching_node_and_match'
+ - 'for_each_memblock'
+ - 'for_each_memblock_type'
+ - 'for_each_memcg_cache_index'
+ - 'for_each_mem_pfn_range'
+ - 'for_each_mem_range'
+ - 'for_each_mem_range_rev'
+ - 'for_each_migratetype_order'
+ - 'for_each_msi_entry'
+ - 'for_each_net'
+ - 'for_each_netdev'
+ - 'for_each_netdev_continue'
+ - 'for_each_netdev_continue_rcu'
+ - 'for_each_netdev_feature'
+ - 'for_each_netdev_in_bond_rcu'
+ - 'for_each_netdev_rcu'
+ - 'for_each_netdev_reverse'
+ - 'for_each_netdev_safe'
+ - 'for_each_net_rcu'
+ - 'for_each_new_connector_in_state'
+ - 'for_each_new_crtc_in_state'
+ - 'for_each_new_plane_in_state'
+ - 'for_each_new_private_obj_in_state'
+ - 'for_each_node'
+ - 'for_each_node_by_name'
+ - 'for_each_node_by_type'
+ - 'for_each_node_mask'
+ - 'for_each_node_state'
+ - 'for_each_node_with_cpus'
+ - 'for_each_node_with_property'
+ - 'for_each_of_allnodes'
+ - 'for_each_of_allnodes_from'
+ - 'for_each_of_pci_range'
+ - 'for_each_old_connector_in_state'
+ - 'for_each_old_crtc_in_state'
+ - 'for_each_oldnew_connector_in_state'
+ - 'for_each_oldnew_crtc_in_state'
+ - 'for_each_oldnew_plane_in_state'
+ - 'for_each_oldnew_private_obj_in_state'
+ - 'for_each_old_plane_in_state'
+ - 'for_each_old_private_obj_in_state'
+ - 'for_each_online_cpu'
+ - 'for_each_online_node'
+ - 'for_each_online_pgdat'
+ - 'for_each_pci_bridge'
+ - 'for_each_pci_dev'
+ - 'for_each_pci_msi_entry'
+ - 'for_each_populated_zone'
+ - 'for_each_possible_cpu'
+ - 'for_each_present_cpu'
+ - 'for_each_prime_number'
+ - 'for_each_prime_number_from'
+ - 'for_each_process'
+ - 'for_each_process_thread'
+ - 'for_each_property_of_node'
+ - 'for_each_reserved_mem_region'
+ - 'for_each_resv_unavail_range'
+ - 'for_each_rtdcom'
+ - 'for_each_rtdcom_safe'
+ - 'for_each_set_bit'
+ - 'for_each_set_bit_from'
+ - 'for_each_sg'
+ - 'for_each_sg_page'
+ - '__for_each_thread'
+ - 'for_each_thread'
+ - 'for_each_zone'
+ - 'for_each_zone_zonelist'
+ - 'for_each_zone_zonelist_nodemask'
+ - 'fwnode_for_each_available_child_node'
+ - 'fwnode_for_each_child_node'
+ - 'fwnode_graph_for_each_endpoint'
+ - 'gadget_for_each_ep'
+ - 'hash_for_each'
+ - 'hash_for_each_possible'
+ - 'hash_for_each_possible_rcu'
+ - 'hash_for_each_possible_rcu_notrace'
+ - 'hash_for_each_possible_safe'
+ - 'hash_for_each_rcu'
+ - 'hash_for_each_safe'
+ - 'hctx_for_each_ctx'
+ - 'hlist_bl_for_each_entry'
+ - 'hlist_bl_for_each_entry_rcu'
+ - 'hlist_bl_for_each_entry_safe'
+ - 'hlist_for_each'
+ - 'hlist_for_each_entry'
+ - 'hlist_for_each_entry_continue'
+ - 'hlist_for_each_entry_continue_rcu'
+ - 'hlist_for_each_entry_continue_rcu_bh'
+ - 'hlist_for_each_entry_from'
+ - 'hlist_for_each_entry_from_rcu'
+ - 'hlist_for_each_entry_rcu'
+ - 'hlist_for_each_entry_rcu_bh'
+ - 'hlist_for_each_entry_rcu_notrace'
+ - 'hlist_for_each_entry_safe'
+ - '__hlist_for_each_rcu'
+ - 'hlist_for_each_safe'
+ - 'hlist_nulls_for_each_entry'
+ - 'hlist_nulls_for_each_entry_from'
+ - 'hlist_nulls_for_each_entry_rcu'
+ - 'hlist_nulls_for_each_entry_safe'
+ - 'ide_host_for_each_port'
+ - 'ide_port_for_each_dev'
+ - 'ide_port_for_each_present_dev'
+ - 'idr_for_each_entry'
+ - 'idr_for_each_entry_continue'
+ - 'idr_for_each_entry_ul'
+ - 'inet_bind_bucket_for_each'
+ - 'inet_lhash2_for_each_icsk_rcu'
+ - 'iov_for_each'
+ - 'key_for_each'
+ - 'key_for_each_safe'
+ - 'klp_for_each_func'
+ - 'klp_for_each_object'
+ - 'kvm_for_each_memslot'
+ - 'kvm_for_each_vcpu'
+ - 'list_for_each'
+ - 'list_for_each_entry'
+ - 'list_for_each_entry_continue'
+ - 'list_for_each_entry_continue_rcu'
+ - 'list_for_each_entry_continue_reverse'
+ - 'list_for_each_entry_from'
+ - 'list_for_each_entry_from_reverse'
+ - 'list_for_each_entry_lockless'
+ - 'list_for_each_entry_rcu'
+ - 'list_for_each_entry_reverse'
+ - 'list_for_each_entry_safe'
+ - 'list_for_each_entry_safe_continue'
+ - 'list_for_each_entry_safe_from'
+ - 'list_for_each_entry_safe_reverse'
+ - 'list_for_each_prev'
+ - 'list_for_each_prev_safe'
+ - 'list_for_each_safe'
+ - 'llist_for_each'
+ - 'llist_for_each_entry'
+ - 'llist_for_each_entry_safe'
+ - 'llist_for_each_safe'
+ - 'media_device_for_each_entity'
+ - 'media_device_for_each_intf'
+ - 'media_device_for_each_link'
+ - 'media_device_for_each_pad'
+ - 'netdev_for_each_lower_dev'
+ - 'netdev_for_each_lower_private'
+ - 'netdev_for_each_lower_private_rcu'
+ - 'netdev_for_each_mc_addr'
+ - 'netdev_for_each_uc_addr'
+ - 'netdev_for_each_upper_dev_rcu'
+ - 'netdev_hw_addr_list_for_each'
+ - 'nft_rule_for_each_expr'
+ - 'nla_for_each_attr'
+ - 'nla_for_each_nested'
+ - 'nlmsg_for_each_attr'
+ - 'nlmsg_for_each_msg'
+ - 'nr_neigh_for_each'
+ - 'nr_neigh_for_each_safe'
+ - 'nr_node_for_each'
+ - 'nr_node_for_each_safe'
+ - 'of_for_each_phandle'
+ - 'of_property_for_each_string'
+ - 'of_property_for_each_u32'
+ - 'pci_bus_for_each_resource'
+ - 'ping_portaddr_for_each_entry'
+ - 'plist_for_each'
+ - 'plist_for_each_continue'
+ - 'plist_for_each_entry'
+ - 'plist_for_each_entry_continue'
+ - 'plist_for_each_entry_safe'
+ - 'plist_for_each_safe'
+ - 'pnp_for_each_card'
+ - 'pnp_for_each_dev'
+ - 'protocol_for_each_card'
+ - 'protocol_for_each_dev'
+ - 'queue_for_each_hw_ctx'
+ - 'radix_tree_for_each_contig'
+ - 'radix_tree_for_each_slot'
+ - 'radix_tree_for_each_tagged'
+ - 'rbtree_postorder_for_each_entry_safe'
+ - 'resource_list_for_each_entry'
+ - 'resource_list_for_each_entry_safe'
+ - 'rhl_for_each_entry_rcu'
+ - 'rhl_for_each_rcu'
+ - 'rht_for_each'
+ - 'rht_for_each_continue'
+ - 'rht_for_each_entry'
+ - 'rht_for_each_entry_continue'
+ - 'rht_for_each_entry_rcu'
+ - 'rht_for_each_entry_rcu_continue'
+ - 'rht_for_each_entry_safe'
+ - 'rht_for_each_rcu'
+ - 'rht_for_each_rcu_continue'
+ - '__rq_for_each_bio'
+ - 'rq_for_each_segment'
+ - 'scsi_for_each_prot_sg'
+ - 'scsi_for_each_sg'
+ - 'sctp_for_each_hentry'
+ - 'sctp_skb_for_each'
+ - 'shdma_for_each_chan'
+ - '__shost_for_each_device'
+ - 'shost_for_each_device'
+ - 'sk_for_each'
+ - 'sk_for_each_bound'
+ - 'sk_for_each_entry_offset_rcu'
+ - 'sk_for_each_from'
+ - 'sk_for_each_rcu'
+ - 'sk_for_each_safe'
+ - 'sk_nulls_for_each'
+ - 'sk_nulls_for_each_from'
+ - 'sk_nulls_for_each_rcu'
+ - 'snd_pcm_group_for_each_entry'
+ - 'snd_soc_dapm_widget_for_each_path'
+ - 'snd_soc_dapm_widget_for_each_path_safe'
+ - 'snd_soc_dapm_widget_for_each_sink_path'
+ - 'snd_soc_dapm_widget_for_each_source_path'
+ - 'tb_property_for_each'
+ - 'udp_portaddr_for_each_entry'
+ - 'udp_portaddr_for_each_entry_rcu'
+ - 'usb_hub_for_each_child'
+ - 'v4l2_device_for_each_subdev'
+ - 'v4l2_m2m_for_each_dst_buf'
+ - 'v4l2_m2m_for_each_dst_buf_safe'
+ - 'v4l2_m2m_for_each_src_buf'
+ - 'v4l2_m2m_for_each_src_buf_safe'
+ - 'zorro_for_each_dev'
+
+#IncludeBlocks: Preserve # Unknown to clang-format-5.0
+IncludeCategories:
+ - Regex: '.*'
+ Priority: 1
+IncludeIsMainRegex: '(Test)?$'
+IndentCaseLabels: false
+#IndentPPDirectives: None # Unknown to clang-format-5.0
+IndentWidth: 8
+IndentWrappedFunctionNames: true
+JavaScriptQuotes: Leave
+JavaScriptWrapImports: true
+KeepEmptyLinesAtTheStartOfBlocks: false
+MacroBlockBegin: ''
+MacroBlockEnd: ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: Inner
+#ObjCBinPackProtocolList: Auto # Unknown to clang-format-5.0
+ObjCBlockIndentWidth: 8
+ObjCSpaceAfterProperty: true
+ObjCSpaceBeforeProtocolList: true
+
+# Taken from git's rules
+#PenaltyBreakAssignment: 10 # Unknown to clang-format-4.0
+PenaltyBreakBeforeFirstCallParameter: 30
+PenaltyBreakComment: 10
+PenaltyBreakFirstLessLess: 0
+PenaltyBreakString: 10
+PenaltyExcessCharacter: 100
+PenaltyReturnTypeOnItsOwnLine: 60
+
+PointerAlignment: Right
+ReflowComments: false
+SortIncludes: false
+#SortUsingDeclarations: false # Unknown to clang-format-4.0
+SpaceAfterCStyleCast: false
+SpaceAfterTemplateKeyword: true
+SpaceBeforeAssignmentOperators: true
+#SpaceBeforeCtorInitializerColon: true # Unknown to clang-format-5.0
+#SpaceBeforeInheritanceColon: true # Unknown to clang-format-5.0
+SpaceBeforeParens: ControlStatements
+#SpaceBeforeRangeBasedForLoopColon: true # Unknown to clang-format-5.0
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles: false
+SpacesInContainerLiterals: false
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard: Cpp03
+TabWidth: 8
+UseTab: Always
+...
!.gitignore
!.mailmap
!.cocciconfig
+!.clang-format
#
# Generated include files
2.6 Locking
lock_page_cgroup()/unlock_page_cgroup() should not be called under
- mapping->tree_lock.
+ the i_pages lock.
Other lock order is following:
PG_locked.
==================================================================
For details about OPP, see Documentation/power/opp.txt
-dev_pm_opp_init_cpufreq_table - cpufreq framework typically is initialized with
- cpufreq_table_validate_and_show() which is provided with the list of
- frequencies that are available for operation. This function provides
- a ready to use conversion routine to translate the OPP layer's internal
- information about the available frequencies into a format readily
- providable to cpufreq.
+dev_pm_opp_init_cpufreq_table -
+ This function provides a ready to use conversion routine to translate
+ the OPP layer's internal information about the available frequencies
+ into a format readily providable to cpufreq.
WARNING: Do not use this function in interrupt context.
/* Do things */
r = dev_pm_opp_init_cpufreq_table(dev, &freq_table);
if (!r)
- cpufreq_table_validate_and_show(policy, freq_table);
+ policy->freq_table = freq_table;
/* Do other things */
}
particular order, but if they are cpufreq core will do DVFS a bit
quickly for them as search for best match is faster.
-By calling cpufreq_table_validate_and_show(), the cpuinfo.min_freq and
-cpuinfo.max_freq values are detected, and policy->min and policy->max
-are set to the same values. This is helpful for the per-CPU
-initialization stage.
+The cpufreq table is verified automatically by the core if the policy contains a
+valid pointer in its policy->freq_table field.
cpufreq_frequency_table_verify() assures that at least one valid
frequency is within policy->min and policy->max, and all other criteria
-r--r--r-- 1 root root 4096 Feb 8 10:42 latency
-r--r--r-- 1 root root 4096 Feb 8 10:42 name
-r--r--r-- 1 root root 4096 Feb 8 10:42 power
+-r--r--r-- 1 root root 4096 Feb 8 10:42 residency
-r--r--r-- 1 root root 4096 Feb 8 10:42 time
-r--r--r-- 1 root root 4096 Feb 8 10:42 usage
-r--r--r-- 1 root root 4096 Feb 8 10:42 latency
-r--r--r-- 1 root root 4096 Feb 8 10:42 name
-r--r--r-- 1 root root 4096 Feb 8 10:42 power
+-r--r--r-- 1 root root 4096 Feb 8 10:42 residency
-r--r--r-- 1 root root 4096 Feb 8 10:42 time
-r--r--r-- 1 root root 4096 Feb 8 10:42 usage
-r--r--r-- 1 root root 4096 Feb 8 10:42 latency
-r--r--r-- 1 root root 4096 Feb 8 10:42 name
-r--r--r-- 1 root root 4096 Feb 8 10:42 power
+-r--r--r-- 1 root root 4096 Feb 8 10:42 residency
-r--r--r-- 1 root root 4096 Feb 8 10:42 time
-r--r--r-- 1 root root 4096 Feb 8 10:42 usage
-r--r--r-- 1 root root 4096 Feb 8 10:42 latency
-r--r--r-- 1 root root 4096 Feb 8 10:42 name
-r--r--r-- 1 root root 4096 Feb 8 10:42 power
+-r--r--r-- 1 root root 4096 Feb 8 10:42 residency
-r--r--r-- 1 root root 4096 Feb 8 10:42 time
-r--r--r-- 1 root root 4096 Feb 8 10:42 usage
--------------------------------------------------------------------------------
* desc : Small description about the idle state (string)
* disable : Option to disable this idle state (bool) -> see note below
* latency : Latency to exit out of this idle state (in microseconds)
+* residency : Time after which a state becomes more effecient than any
+ shallower state (in microseconds)
* name : Name of the idle state (string)
* power : Power consumed while in this idle state (in milliwatts)
* time : Total time spent in this idle state (in microseconds)
the device is compatible with the R-Car Gen2 VMSA-compatible IPMMU.
- "renesas,ipmmu-r8a73a4" for the R8A73A4 (R-Mobile APE6) IPMMU.
+ - "renesas,ipmmu-r8a7743" for the R8A7743 (RZ/G1M) IPMMU.
+ - "renesas,ipmmu-r8a7745" for the R8A7745 (RZ/G1E) IPMMU.
- "renesas,ipmmu-r8a7790" for the R8A7790 (R-Car H2) IPMMU.
- "renesas,ipmmu-r8a7791" for the R8A7791 (R-Car M2-W) IPMMU.
- "renesas,ipmmu-r8a7793" for the R8A7793 (R-Car M2-N) IPMMU.
- "renesas,ipmmu-r8a7796" for the R8A7796 (R-Car M3-W) IPMMU.
- "renesas,ipmmu-r8a77970" for the R8A77970 (R-Car V3M) IPMMU.
- "renesas,ipmmu-r8a77995" for the R8A77995 (R-Car D3) IPMMU.
- - "renesas,ipmmu-vmsa" for generic R-Car Gen2 VMSA-compatible IPMMU.
+ - "renesas,ipmmu-vmsa" for generic R-Car Gen2 or RZ/G1 VMSA-compatible
+ IPMMU.
- reg: Base address and size of the IPMMU registers.
- interrupts: Specifiers for the MMU fault interrupts. For instances that
"single-master" device, and needs no additional information
to associate with its master device. See:
Documentation/devicetree/bindings/iommu/iommu.txt
+- clocks : A list of clocks required for the IOMMU to be accessible by
+ the host CPU.
+- clock-names : Should contain the following:
+ "iface" - Main peripheral bus clock (PCLK/HCL) (required)
+ "aclk" - AXI bus clock (required)
Optional properties:
- rockchip,disable-mmu-reset : Don't use the mmu reset operation.
reg = <0xff940300 0x100>;
interrupts = <GIC_SPI 16 IRQ_TYPE_LEVEL_HIGH>;
interrupt-names = "vopl_mmu";
+ clocks = <&cru ACLK_VOP1>, <&cru HCLK_VOP1>;
+ clock-names = "aclk", "iface";
#iommu-cells = <0>;
};
- Proc filesystem.
- The cell database.
- Security.
- - Examples.
+ - The @sys substitution.
========
passed to a process that doesn't have that key (perhaps over an AF_UNIX
socket), then the operations on the file will be made with key that was used to
open the file.
+
+
+=====================
+THE @SYS SUBSTITUTION
+=====================
+
+The list of up to 16 @sys substitutions for the current network namespace can
+be configured by writing a list to /proc/fs/afs/sysname:
+
+ [root@andromeda ~]# echo foo amd64_linux_26 >/proc/fs/afs/sysname
+
+or cleared entirely by writing an empty list:
+
+ [root@andromeda ~]# echo >/proc/fs/afs/sysname
+
+The current list for current network namespace can be retrieved by:
+
+ [root@andromeda ~]# cat /proc/fs/afs/sysname
+ foo
+ amd64_linux_26
+
+When @sys is being substituted for, each element of the list is tried in the
+order given.
+
+By default, the list will contain one item that conforms to the pattern
+"<arch>_linux_26", amd64 being the name for x86_64.
Glock locking order within GFS2:
- 1. i_mutex (if required)
+ 1. i_rwsem (if required)
2. Rename glock (for rename only)
3. Inode glock(s)
(Parents before children, inodes at "same level" with same parent in
lock number order)
4. Rgrp glock(s) (for (de)allocation operations)
5. Transaction glock (via gfs2_trans_begin) for non-read operations
- 6. Page lock (always last, very important!)
+ 6. i_rw_mutex (if required)
+ 7. Page lock (always last, very important!)
There are two glocks per inode. One deals with access to the inode
itself (locking order as above), and the other, known as the iopen
style (a line which becomes far less readable if split to fit within the
80-column limit, for example), just do it.
+Note that you can also use the ``clang-format`` tool to help you with
+these rules, to quickly re-format parts of your code automatically,
+and to review full files in order to spot coding style mistakes,
+typos and possible improvements. It is also handy for sorting ``#includes``,
+for aligning variables/macros, for reflowing text and other similar tasks.
+See the file :ref:`Documentation/process/clang-format.rst <clangformat>`
+for more details.
+
Abstraction layers
******************
--- /dev/null
+.. _clangformat:
+
+clang-format
+============
+
+``clang-format`` is a tool to format C/C++/... code according to
+a set of rules and heuristics. Like most tools, it is not perfect
+nor covers every single case, but it is good enough to be helpful.
+
+``clang-format`` can be used for several purposes:
+
+ - Quickly reformat a block of code to the kernel style. Specially useful
+ when moving code around and aligning/sorting. See clangformatreformat_.
+
+ - Spot style mistakes, typos and possible improvements in files
+ you maintain, patches you review, diffs, etc. See clangformatreview_.
+
+ - Help you follow the coding style rules, specially useful for those
+ new to kernel development or working at the same time in several
+ projects with different coding styles.
+
+Its configuration file is ``.clang-format`` in the root of the kernel tree.
+The rules contained there try to approximate the most common kernel
+coding style. They also try to follow :ref:`Documentation/process/coding-style.rst <codingstyle>`
+as much as possible. Since not all the kernel follows the same style,
+it is possible that you may want to tweak the defaults for a particular
+subsystem or folder. To do so, you can override the defaults by writing
+another ``.clang-format`` file in a subfolder.
+
+The tool itself has already been included in the repositories of popular
+Linux distributions for a long time. Search for ``clang-format`` in
+your repositories. Otherwise, you can either download pre-built
+LLVM/clang binaries or build the source code from:
+
+ http://releases.llvm.org/download.html
+
+See more information about the tool at:
+
+ https://clang.llvm.org/docs/ClangFormat.html
+
+ https://clang.llvm.org/docs/ClangFormatStyleOptions.html
+
+
+.. _clangformatreview:
+
+Review files and patches for coding style
+-----------------------------------------
+
+By running the tool in its inline mode, you can review full subsystems,
+folders or individual files for code style mistakes, typos or improvements.
+
+To do so, you can run something like::
+
+ # Make sure your working directory is clean!
+ clang-format -i kernel/*.[ch]
+
+And then take a look at the git diff.
+
+Counting the lines of such a diff is also useful for improving/tweaking
+the style options in the configuration file; as well as testing new
+``clang-format`` features/versions.
+
+``clang-format`` also supports reading unified diffs, so you can review
+patches and git diffs easily. See the documentation at:
+
+ https://clang.llvm.org/docs/ClangFormat.html#script-for-patch-reformatting
+
+To avoid ``clang-format`` formatting some portion of a file, you can do::
+
+ int formatted_code;
+ // clang-format off
+ void unformatted_code ;
+ // clang-format on
+ void formatted_code_again;
+
+While it might be tempting to use this to keep a file always in sync with
+``clang-format``, specially if you are writing new files or if you are
+a maintainer, please note that people might be running different
+``clang-format`` versions or not have it available at all. Therefore,
+you should probably refrain yourself from using this in kernel sources;
+at least until we see if ``clang-format`` becomes commonplace.
+
+
+.. _clangformatreformat:
+
+Reformatting blocks of code
+---------------------------
+
+By using an integration with your text editor, you can reformat arbitrary
+blocks (selections) of code with a single keystroke. This is specially
+useful when moving code around, for complex code that is deeply intended,
+for multi-line macros (and aligning their backslashes), etc.
+
+Remember that you can always tweak the changes afterwards in those cases
+where the tool did not do an optimal job. But as a first approximation,
+it can be very useful.
+
+There are integrations for many popular text editors. For some of them,
+like vim, emacs, BBEdit and Visual Studio you can find support built-in.
+For instructions, read the appropiate section at:
+
+ https://clang.llvm.org/docs/ClangFormat.html
+
+For Atom, Eclipse, Sublime Text, Visual Studio Code, XCode and other
+editors and IDEs you should be able to find ready-to-use plugins.
+
+For this use case, consider using a secondary ``.clang-format``
+so that you can tweak a few options. See clangformatextra_.
+
+
+.. _clangformatmissing:
+
+Missing support
+---------------
+
+``clang-format`` is missing support for some things that are common
+in kernel code. They are easy to remember, so if you use the tool
+regularly, you will quickly learn to avoid/ignore those.
+
+In particular, some very common ones you will notice are:
+
+ - Aligned blocks of one-line ``#defines``, e.g.::
+
+ #define TRACING_MAP_BITS_DEFAULT 11
+ #define TRACING_MAP_BITS_MAX 17
+ #define TRACING_MAP_BITS_MIN 7
+
+ vs.::
+
+ #define TRACING_MAP_BITS_DEFAULT 11
+ #define TRACING_MAP_BITS_MAX 17
+ #define TRACING_MAP_BITS_MIN 7
+
+ - Aligned designated initializers, e.g.::
+
+ static const struct file_operations uprobe_events_ops = {
+ .owner = THIS_MODULE,
+ .open = probes_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+ .write = probes_write,
+ };
+
+ vs.::
+
+ static const struct file_operations uprobe_events_ops = {
+ .owner = THIS_MODULE,
+ .open = probes_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+ .write = probes_write,
+ };
+
+
+.. _clangformatextra:
+
+Extra features/options
+----------------------
+
+Some features/style options are not enabled by default in the configuration
+file in order to minimize the differences between the output and the current
+code. In other words, to make the difference as small as possible,
+which makes reviewing full-file style, as well diffs and patches as easy
+as possible.
+
+In other cases (e.g. particular subsystems/folders/files), the kernel style
+might be different and enabling some of these options may approximate
+better the style there.
+
+For instance:
+
+ - Aligning assignments (``AlignConsecutiveAssignments``).
+
+ - Aligning declarations (``AlignConsecutiveDeclarations``).
+
+ - Reflowing text in comments (``ReflowComments``).
+
+ - Sorting ``#includes`` (``SortIncludes``).
+
+They are typically useful for block re-formatting, rather than full-file.
+You might want to create another ``.clang-format`` file and use that one
+from your editor/IDE instead.
re-formatting you may want to take a look at the man page. But
remember: ``indent`` is not a fix for bad programming.
+Note that you can also use the ``clang-format`` tool to help you with
+these rules, to quickly re-format parts of your code automatically,
+and to review full files in order to spot coding style mistakes,
+typos and possible improvements. It is also handy for sorting ``#includes``,
+for aligning variables/macros, for reflowing text and other similar tasks.
+See the file :ref:`Documentation/process/clang-format.rst <clangformat>`
+for more details.
+
10) Kconfig configuration files
-------------------------------
tainted:
-Non-zero if the kernel has been tainted. Numeric values, which
-can be ORed together:
-
- 1 - A module with a non-GPL license has been loaded, this
- includes modules with no license.
- Set by modutils >= 2.4.9 and module-init-tools.
- 2 - A module was force loaded by insmod -f.
- Set by modutils >= 2.4.9 and module-init-tools.
- 4 - Unsafe SMP processors: SMP with CPUs not designed for SMP.
- 8 - A module was forcibly unloaded from the system by rmmod -f.
- 16 - A hardware machine check error occurred on the system.
- 32 - A bad page was discovered on the system.
- 64 - The user has asked that the system be marked "tainted". This
- could be because they are running software that directly modifies
- the hardware, or for other reasons.
- 128 - The system has died.
- 256 - The ACPI DSDT has been overridden with one supplied by the user
- instead of using the one provided by the hardware.
- 512 - A kernel warning has occurred.
-1024 - A module from drivers/staging was loaded.
-2048 - The system is working around a severe firmware bug.
-4096 - An out-of-tree module has been loaded.
-8192 - An unsigned module has been loaded in a kernel supporting module
- signature.
-16384 - A soft lockup has previously occurred on the system.
-32768 - The kernel has been live patched.
+Non-zero if the kernel has been tainted. Numeric values, which can be
+ORed together. The letters are seen in "Tainted" line of Oops reports.
+
+ 1 (P): A module with a non-GPL license has been loaded, this
+ includes modules with no license.
+ Set by modutils >= 2.4.9 and module-init-tools.
+ 2 (F): A module was force loaded by insmod -f.
+ Set by modutils >= 2.4.9 and module-init-tools.
+ 4 (S): Unsafe SMP processors: SMP with CPUs not designed for SMP.
+ 8 (R): A module was forcibly unloaded from the system by rmmod -f.
+ 16 (M): A hardware machine check error occurred on the system.
+ 32 (B): A bad page was discovered on the system.
+ 64 (U): The user has asked that the system be marked "tainted". This
+ could be because they are running software that directly modifies
+ the hardware, or for other reasons.
+ 128 (D): The system has died.
+ 256 (A): The ACPI DSDT has been overridden with one supplied by the user
+ instead of using the one provided by the hardware.
+ 512 (W): A kernel warning has occurred.
+ 1024 (C): A module from drivers/staging was loaded.
+ 2048 (I): The system is working around a severe firmware bug.
+ 4096 (O): An out-of-tree module has been loaded.
+ 8192 (E): An unsigned module has been loaded in a kernel supporting module
+ signature.
+ 16384 (L): A soft lockup has previously occurred on the system.
+ 32768 (K): The kernel has been live patched.
+ 65536 (X): Auxiliary taint, defined and used by for distros.
+131072 (T): The kernel was built with the struct randomization plugin.
==============================================================
% cat /proc/sys/vm/lowmem_reserve_ratio
256 256 32
-
-Note: # of this elements is one fewer than number of zones. Because the highest
- zone's value is not necessary for following calculation.
But, these values are not used directly. The kernel calculates # of protection
pages for each zones from them. These are shown as array of protection pages
pages of higher zones on the node.
If you would like to protect more pages, smaller values are effective.
-The minimum value is 1 (1/1 -> 100%).
+The minimum value is 1 (1/1 -> 100%). The value less than 1 completely
+disables protection of the pages.
==============================================================
Heterogeneous Memory Management (HMM)
-Transparently allow any component of a program to use any memory region of said
-program with a device without using device specific memory allocator. This is
-becoming a requirement to simplify the use of advance heterogeneous computing
-where GPU, DSP or FPGA are use to perform various computations.
-
-This document is divided as follow, in the first section i expose the problems
-related to the use of a device specific allocator. The second section i expose
-the hardware limitations that are inherent to many platforms. The third section
-gives an overview of HMM designs. The fourth section explains how CPU page-
-table mirroring works and what is HMM purpose in this context. Fifth section
-deals with how device memory is represented inside the kernel. Finaly the last
-section present the new migration helper that allow to leverage the device DMA
-engine.
-
-
-1) Problems of using device specific memory allocator:
-2) System bus, device memory characteristics
-3) Share address space and migration
+Provide infrastructure and helpers to integrate non-conventional memory (device
+memory like GPU on board memory) into regular kernel path, with the cornerstone
+of this being specialized struct page for such memory (see sections 5 to 7 of
+this document).
+
+HMM also provides optional helpers for SVM (Share Virtual Memory), i.e.,
+allowing a device to transparently access program address coherently with the
+CPU meaning that any valid pointer on the CPU is also a valid pointer for the
+device. This is becoming mandatory to simplify the use of advanced hetero-
+geneous computing where GPU, DSP, or FPGA are used to perform various
+computations on behalf of a process.
+
+This document is divided as follows: in the first section I expose the problems
+related to using device specific memory allocators. In the second section, I
+expose the hardware limitations that are inherent to many platforms. The third
+section gives an overview of the HMM design. The fourth section explains how
+CPU page-table mirroring works and the purpose of HMM in this context. The
+fifth section deals with how device memory is represented inside the kernel.
+Finally, the last section presents a new migration helper that allows lever-
+aging the device DMA engine.
+
+
+1) Problems of using a device specific memory allocator:
+2) I/O bus, device memory characteristics
+3) Shared address space and migration
4) Address space mirroring implementation and API
5) Represent and manage device memory from core kernel point of view
-6) Migrate to and from device memory
+6) Migration to and from device memory
7) Memory cgroup (memcg) and rss accounting
-------------------------------------------------------------------------------
-1) Problems of using device specific memory allocator:
-
-Device with large amount of on board memory (several giga bytes) like GPU have
-historically manage their memory through dedicated driver specific API. This
-creates a disconnect between memory allocated and managed by device driver and
-regular application memory (private anonymous, share memory or regular file
-back memory). From here on i will refer to this aspect as split address space.
-I use share address space to refer to the opposite situation ie one in which
-any memory region can be use by device transparently.
-
-Split address space because device can only access memory allocated through the
-device specific API. This imply that all memory object in a program are not
-equal from device point of view which complicate large program that rely on a
-wide set of libraries.
-
-Concretly this means that code that wants to leverage device like GPU need to
-copy object between genericly allocated memory (malloc, mmap private/share/)
-and memory allocated through the device driver API (this still end up with an
-mmap but of the device file).
-
-For flat dataset (array, grid, image, ...) this isn't too hard to achieve but
-complex data-set (list, tree, ...) are hard to get right. Duplicating a complex
-data-set need to re-map all the pointer relations between each of its elements.
-This is error prone and program gets harder to debug because of the duplicate
-data-set.
-
-Split address space also means that library can not transparently use data they
-are getting from core program or other library and thus each library might have
-to duplicate its input data-set using specific memory allocator. Large project
-suffer from this and waste resources because of the various memory copy.
-
-Duplicating each library API to accept as input or output memory allocted by
+1) Problems of using a device specific memory allocator:
+
+Devices with a large amount of on board memory (several gigabytes) like GPUs
+have historically managed their memory through dedicated driver specific APIs.
+This creates a disconnect between memory allocated and managed by a device
+driver and regular application memory (private anonymous, shared memory, or
+regular file backed memory). From here on I will refer to this aspect as split
+address space. I use shared address space to refer to the opposite situation:
+i.e., one in which any application memory region can be used by a device
+transparently.
+
+Split address space happens because device can only access memory allocated
+through device specific API. This implies that all memory objects in a program
+are not equal from the device point of view which complicates large programs
+that rely on a wide set of libraries.
+
+Concretely this means that code that wants to leverage devices like GPUs needs
+to copy object between generically allocated memory (malloc, mmap private, mmap
+share) and memory allocated through the device driver API (this still ends up
+with an mmap but of the device file).
+
+For flat data sets (array, grid, image, ...) this isn't too hard to achieve but
+complex data sets (list, tree, ...) are hard to get right. Duplicating a
+complex data set needs to re-map all the pointer relations between each of its
+elements. This is error prone and program gets harder to debug because of the
+duplicate data set and addresses.
+
+Split address space also means that libraries cannot transparently use data
+they are getting from the core program or another library and thus each library
+might have to duplicate its input data set using the device specific memory
+allocator. Large projects suffer from this and waste resources because of the
+various memory copies.
+
+Duplicating each library API to accept as input or output memory allocated by
each device specific allocator is not a viable option. It would lead to a
-combinatorial explosions in the library entry points.
+combinatorial explosion in the library entry points.
-Finaly with the advance of high level language constructs (in C++ but in other
-language too) it is now possible for compiler to leverage GPU or other devices
-without even the programmer knowledge. Some of compiler identified patterns are
-only do-able with a share address. It is as well more reasonable to use a share
-address space for all the other patterns.
+Finally, with the advance of high level language constructs (in C++ but in
+other languages too) it is now possible for the compiler to leverage GPUs and
+other devices without programmer knowledge. Some compiler identified patterns
+are only do-able with a shared address space. It is also more reasonable to use
+a shared address space for all other patterns.
-------------------------------------------------------------------------------
-2) System bus, device memory characteristics
+2) I/O bus, device memory characteristics
-System bus cripple share address due to few limitations. Most system bus only
-allow basic memory access from device to main memory, even cache coherency is
-often optional. Access to device memory from CPU is even more limited, most
-often than not it is not cache coherent.
+I/O buses cripple shared address spaces due to a few limitations. Most I/O
+buses only allow basic memory access from device to main memory; even cache
+coherency is often optional. Access to device memory from CPU is even more
+limited. More often than not, it is not cache coherent.
-If we only consider the PCIE bus than device can access main memory (often
-through an IOMMU) and be cache coherent with the CPUs. However it only allows
-a limited set of atomic operation from device on main memory. This is worse
-in the other direction the CPUs can only access a limited range of the device
-memory and can not perform atomic operations on it. Thus device memory can not
-be consider like regular memory from kernel point of view.
+If we only consider the PCIE bus, then a device can access main memory (often
+through an IOMMU) and be cache coherent with the CPUs. However, it only allows
+a limited set of atomic operations from device on main memory. This is worse
+in the other direction: the CPU can only access a limited range of the device
+memory and cannot perform atomic operations on it. Thus device memory cannot
+be considered the same as regular memory from the kernel point of view.
Another crippling factor is the limited bandwidth (~32GBytes/s with PCIE 4.0
-and 16 lanes). This is 33 times less that fastest GPU memory (1 TBytes/s).
-The final limitation is latency, access to main memory from the device has an
-order of magnitude higher latency than when the device access its own memory.
+and 16 lanes). This is 33 times less than the fastest GPU memory (1 TBytes/s).
+The final limitation is latency. Access to main memory from the device has an
+order of magnitude higher latency than when the device accesses its own memory.
-Some platform are developing new system bus or additions/modifications to PCIE
-to address some of those limitations (OpenCAPI, CCIX). They mainly allow two
+Some platforms are developing new I/O buses or additions/modifications to PCIE
+to address some of these limitations (OpenCAPI, CCIX). They mainly allow two-
way cache coherency between CPU and device and allow all atomic operations the
-architecture supports. Saddly not all platform are following this trends and
-some major architecture are left without hardware solutions to those problems.
+architecture supports. Sadly, not all platforms are following this trend and
+some major architectures are left without hardware solutions to these problems.
-So for share address space to make sense not only we must allow device to
-access any memory memory but we must also permit any memory to be migrated to
-device memory while device is using it (blocking CPU access while it happens).
+So for shared address space to make sense, not only must we allow devices to
+access any memory but we must also permit any memory to be migrated to device
+memory while device is using it (blocking CPU access while it happens).
-------------------------------------------------------------------------------
-3) Share address space and migration
+3) Shared address space and migration
HMM intends to provide two main features. First one is to share the address
-space by duplication the CPU page table into the device page table so same
-address point to same memory and this for any valid main memory address in
+space by duplicating the CPU page table in the device page table so the same
+address points to the same physical memory for any valid main memory address in
the process address space.
-To achieve this, HMM offer a set of helpers to populate the device page table
+To achieve this, HMM offers a set of helpers to populate the device page table
while keeping track of CPU page table updates. Device page table updates are
-not as easy as CPU page table updates. To update the device page table you must
-allow a buffer (or use a pool of pre-allocated buffer) and write GPU specifics
-commands in it to perform the update (unmap, cache invalidations and flush,
-...). This can not be done through common code for all device. Hence why HMM
-provides helpers to factor out everything that can be while leaving the gory
-details to the device driver.
-
-The second mechanism HMM provide is a new kind of ZONE_DEVICE memory that does
-allow to allocate a struct page for each page of the device memory. Those page
-are special because the CPU can not map them. They however allow to migrate
-main memory to device memory using exhisting migration mechanism and everything
-looks like if page was swap out to disk from CPU point of view. Using a struct
-page gives the easiest and cleanest integration with existing mm mechanisms.
-Again here HMM only provide helpers, first to hotplug new ZONE_DEVICE memory
-for the device memory and second to perform migration. Policy decision of what
-and when to migrate things is left to the device driver.
-
-Note that any CPU access to a device page trigger a page fault and a migration
-back to main memory ie when a page backing an given address A is migrated from
-a main memory page to a device page then any CPU access to address A trigger a
-page fault and initiate a migration back to main memory.
-
-
-With this two features, HMM not only allow a device to mirror a process address
-space and keeps both CPU and device page table synchronize, but also allow to
-leverage device memory by migrating part of data-set that is actively use by a
-device.
+not as easy as CPU page table updates. To update the device page table, you must
+allocate a buffer (or use a pool of pre-allocated buffers) and write GPU
+specific commands in it to perform the update (unmap, cache invalidations, and
+flush, ...). This cannot be done through common code for all devices. Hence
+why HMM provides helpers to factor out everything that can be while leaving the
+hardware specific details to the device driver.
+
+The second mechanism HMM provides is a new kind of ZONE_DEVICE memory that
+allows allocating a struct page for each page of the device memory. Those pages
+are special because the CPU cannot map them. However, they allow migrating
+main memory to device memory using existing migration mechanisms and everything
+looks like a page is swapped out to disk from the CPU point of view. Using a
+struct page gives the easiest and cleanest integration with existing mm mech-
+anisms. Here again, HMM only provides helpers, first to hotplug new ZONE_DEVICE
+memory for the device memory and second to perform migration. Policy decisions
+of what and when to migrate things is left to the device driver.
+
+Note that any CPU access to a device page triggers a page fault and a migration
+back to main memory. For example, when a page backing a given CPU address A is
+migrated from a main memory page to a device page, then any CPU access to
+address A triggers a page fault and initiates a migration back to main memory.
+
+With these two features, HMM not only allows a device to mirror process address
+space and keeping both CPU and device page table synchronized, but also lever-
+ages device memory by migrating the part of the data set that is actively being
+used by the device.
-------------------------------------------------------------------------------
4) Address space mirroring implementation and API
-Address space mirroring main objective is to allow to duplicate range of CPU
-page table into a device page table and HMM helps keeping both synchronize. A
-device driver that want to mirror a process address space must start with the
+Address space mirroring's main objective is to allow duplication of a range of
+CPU page table into a device page table; HMM helps keep both synchronized. A
+device driver that wants to mirror a process address space must start with the
registration of an hmm_mirror struct:
int hmm_mirror_register(struct hmm_mirror *mirror,
int hmm_mirror_register_locked(struct hmm_mirror *mirror,
struct mm_struct *mm);
-The locked variant is to be use when the driver is already holding the mmap_sem
-of the mm in write mode. The mirror struct has a set of callback that are use
-to propagate CPU page table:
+The locked variant is to be used when the driver is already holding mmap_sem
+of the mm in write mode. The mirror struct has a set of callbacks that are used
+to propagate CPU page tables:
struct hmm_mirror_ops {
/* sync_cpu_device_pagetables() - synchronize page tables
unsigned long end);
};
-Device driver must perform update to the range following action (turn range
-read only, or fully unmap, ...). Once driver callback returns the device must
-be done with the update.
+The device driver must perform the update action to the range (mark range
+read only, or fully unmap, ...). The device must be done with the update before
+the driver callback returns.
-When device driver wants to populate a range of virtual address it can use
-either:
+When the device driver wants to populate a range of virtual addresses, it can
+use either:
int hmm_vma_get_pfns(struct vm_area_struct *vma,
struct hmm_range *range,
unsigned long start,
bool write,
bool block);
-First one (hmm_vma_get_pfns()) will only fetch present CPU page table entry and
-will not trigger a page fault on missing or non present entry. The second one
-do trigger page fault on missing or read only entry if write parameter is true.
-Page fault use the generic mm page fault code path just like a CPU page fault.
+The first one (hmm_vma_get_pfns()) will only fetch present CPU page table
+entries and will not trigger a page fault on missing or non-present entries.
+The second one does trigger a page fault on missing or read-only entry if the
+write parameter is true. Page faults use the generic mm page fault code path
+just like a CPU page fault.
-Both function copy CPU page table into their pfns array argument. Each entry in
-that array correspond to an address in the virtual range. HMM provide a set of
-flags to help driver identify special CPU page table entries.
+Both functions copy CPU page table entries into their pfns array argument. Each
+entry in that array corresponds to an address in the virtual range. HMM
+provides a set of flags to help the driver identify special CPU page table
+entries.
Locking with the update() callback is the most important aspect the driver must
-respect in order to keep things properly synchronize. The usage pattern is :
+respect in order to keep things properly synchronized. The usage pattern is:
int driver_populate_range(...)
{
return 0;
}
-The driver->update lock is the same lock that driver takes inside its update()
-callback. That lock must be call before hmm_vma_range_done() to avoid any race
-with a concurrent CPU page table update.
+The driver->update lock is the same lock that the driver takes inside its
+update() callback. That lock must be held before hmm_vma_range_done() to avoid
+any race with a concurrent CPU page table update.
-HMM implements all this on top of the mmu_notifier API because we wanted to a
-simpler API and also to be able to perform optimization latter own like doing
-concurrent device update in multi-devices scenario.
+HMM implements all this on top of the mmu_notifier API because we wanted a
+simpler API and also to be able to perform optimizations latter on like doing
+concurrent device updates in multi-devices scenario.
-HMM also serve as an impedence missmatch between how CPU page table update are
-done (by CPU write to the page table and TLB flushes) from how device update
-their own page table. Device update is a multi-step process, first appropriate
-commands are write to a buffer, then this buffer is schedule for execution on
-the device. It is only once the device has executed commands in the buffer that
-the update is done. Creating and scheduling update command buffer can happen
-concurrently for multiple devices. Waiting for each device to report commands
-as executed is serialize (there is no point in doing this concurrently).
+HMM also serves as an impedance mismatch between how CPU page table updates
+are done (by CPU write to the page table and TLB flushes) and how devices
+update their own page table. Device updates are a multi-step process. First,
+appropriate commands are written to a buffer, then this buffer is scheduled for
+execution on the device. It is only once the device has executed commands in
+the buffer that the update is done. Creating and scheduling the update command
+buffer can happen concurrently for multiple devices. Waiting for each device to
+report commands as executed is serialized (there is no point in doing this
+concurrently).
-------------------------------------------------------------------------------
5) Represent and manage device memory from core kernel point of view
-Several differents design were try to support device memory. First one use
-device specific data structure to keep information about migrated memory and
-HMM hooked itself in various place of mm code to handle any access to address
-that were back by device memory. It turns out that this ended up replicating
-most of the fields of struct page and also needed many kernel code path to be
-updated to understand this new kind of memory.
+Several different designs were tried to support device memory. First one used
+a device specific data structure to keep information about migrated memory and
+HMM hooked itself in various places of mm code to handle any access to
+addresses that were backed by device memory. It turns out that this ended up
+replicating most of the fields of struct page and also needed many kernel code
+paths to be updated to understand this new kind of memory.
-Thing is most kernel code path never try to access the memory behind a page
-but only care about struct page contents. Because of this HMM switchted to
-directly using struct page for device memory which left most kernel code path
-un-aware of the difference. We only need to make sure that no one ever try to
-map those page from the CPU side.
+Most kernel code paths never try to access the memory behind a page
+but only care about struct page contents. Because of this, HMM switched to
+directly using struct page for device memory which left most kernel code paths
+unaware of the difference. We only need to make sure that no one ever tries to
+map those pages from the CPU side.
-HMM provide a set of helpers to register and hotplug device memory as a new
-region needing struct page. This is offer through a very simple API:
+HMM provides a set of helpers to register and hotplug device memory as a new
+region needing a struct page. This is offered through a very simple API:
struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
struct device *device,
};
The first callback (free()) happens when the last reference on a device page is
-drop. This means the device page is now free and no longer use by anyone. The
-second callback happens whenever CPU try to access a device page which it can
-not do. This second callback must trigger a migration back to system memory.
+dropped. This means the device page is now free and no longer used by anyone.
+The second callback happens whenever the CPU tries to access a device page
+which it cannot do. This second callback must trigger a migration back to
+system memory.
-------------------------------------------------------------------------------
-6) Migrate to and from device memory
+6) Migration to and from device memory
-Because CPU can not access device memory, migration must use device DMA engine
-to perform copy from and to device memory. For this we need a new migration
-helper:
+Because the CPU cannot access device memory, migration must use the device DMA
+engine to perform copy from and to device memory. For this we need a new
+migration helper:
int migrate_vma(const struct migrate_vma_ops *ops,
struct vm_area_struct *vma,
unsigned long *dst,
void *private);
-Unlike other migration function it works on a range of virtual address, there
-is two reasons for that. First device DMA copy has a high setup overhead cost
+Unlike other migration functions it works on a range of virtual address, there
+are two reasons for that. First, device DMA copy has a high setup overhead cost
and thus batching multiple pages is needed as otherwise the migration overhead
-make the whole excersie pointless. The second reason is because driver trigger
-such migration base on range of address the device is actively accessing.
+makes the whole exercise pointless. The second reason is because the
+migration might be for a range of addresses the device is actively accessing.
-The migrate_vma_ops struct define two callbacks. First one (alloc_and_copy())
-control destination memory allocation and copy operation. Second one is there
-to allow device driver to perform cleanup operation after migration.
+The migrate_vma_ops struct defines two callbacks. First one (alloc_and_copy())
+controls destination memory allocation and copy operation. Second one is there
+to allow the device driver to perform cleanup operations after migration.
struct migrate_vma_ops {
void (*alloc_and_copy)(struct vm_area_struct *vma,
void *private);
};
-It is important to stress that this migration helpers allow for hole in the
+It is important to stress that these migration helpers allow for holes in the
virtual address range. Some pages in the range might not be migrated for all
-the usual reasons (page is pin, page is lock, ...). This helper does not fail
-but just skip over those pages.
+the usual reasons (page is pinned, page is locked, ...). This helper does not
+fail but just skips over those pages.
-The alloc_and_copy() might as well decide to not migrate all pages in the
-range (for reasons under the callback control). For those the callback just
-have to leave the corresponding dst entry empty.
+The alloc_and_copy() might decide to not migrate all pages in the
+range (for reasons under the callback control). For those, the callback just
+has to leave the corresponding dst entry empty.
-Finaly the migration of the struct page might fails (for file back page) for
+Finally, the migration of the struct page might fail (for file backed page) for
various reasons (failure to freeze reference, or update page cache, ...). If
-that happens then the finalize_and_map() can catch any pages that was not
-migrated. Note those page were still copied to new page and thus we wasted
+that happens, then the finalize_and_map() can catch any pages that were not
+migrated. Note those pages were still copied to a new page and thus we wasted
bandwidth but this is considered as a rare event and a price that we are
willing to pay to keep all the code simpler.
7) Memory cgroup (memcg) and rss accounting
For now device memory is accounted as any regular page in rss counters (either
-anonymous if device page is use for anonymous, file if device page is use for
-file back page or shmem if device page is use for share memory). This is a
-deliberate choice to keep existing application that might start using device
-memory without knowing about it to keep runing unimpacted.
-
-Drawbacks is that OOM killer might kill an application using a lot of device
-memory and not a lot of regular system memory and thus not freeing much system
-memory. We want to gather more real world experience on how application and
-system react under memory pressure in the presence of device memory before
+anonymous if device page is used for anonymous, file if device page is used for
+file backed page or shmem if device page is used for shared memory). This is a
+deliberate choice to keep existing applications, that might start using device
+memory without knowing about it, running unimpacted.
+
+A drawback is that the OOM killer might kill an application using a lot of
+device memory and not a lot of regular system memory and thus not freeing much
+system memory. We want to gather more real world experience on how applications
+and system react under memory pressure in the presence of device memory before
deciding to account device memory differently.
-Same decision was made for memory cgroup. Device memory page are accounted
+Same decision was made for memory cgroup. Device memory pages are accounted
against same memory cgroup a regular page would be accounted to. This does
simplify migration to and from device memory. This also means that migration
-back from device memory to regular memory can not fail because it would
+back from device memory to regular memory cannot fail because it would
go above memory cgroup limit. We might revisit this choice latter on once we
-get more experience in how device memory is use and its impact on memory
+get more experience in how device memory is used and its impact on memory
resource control.
-Note that device memory can never be pin nor by device driver nor through GUP
+Note that device memory can never be pinned by device driver nor through GUP
and thus such memory is always free upon process exit. Or when last reference
-is drop in case of share memory or file back memory.
+is dropped in case of shared memory or file backed memory.
1. Lock the page to be migrated
-2. Insure that writeback is complete.
+2. Ensure that writeback is complete.
3. Lock the new page that we want to move to. It is locked so that accesses to
this (not yet uptodate) page immediately lock while the move is in progress.
mapcount is not zero then we do not migrate the page. All user space
processes that attempt to access the page will now wait on the page lock.
-5. The radix tree lock is taken. This will cause all processes trying
- to access the page via the mapping to block on the radix tree spinlock.
+5. The i_pages lock is taken. This will cause all processes trying
+ to access the page via the mapping to block on the spinlock.
6. The refcount of the page is examined and we back out if references remain
otherwise we know that we are the only one referencing this page.
9. The radix tree is changed to point to the new page.
-10. The reference count of the old page is dropped because the radix tree
+10. The reference count of the old page is dropped because the address space
reference is gone. A reference to the new page is established because
- the new page is referenced to by the radix tree.
+ the new page is referenced by the address space.
-11. The radix tree lock is dropped. With that lookups in the mapping
- become possible again. Processes will move from spinning on the tree_lock
+11. The i_pages lock is dropped. With that lookups in the mapping
+ become possible again. Processes will move from spinning on the lock
to sleeping on the locked new page.
12. The page contents are copied to the new page.
ARM/ASPEED MACHINE SUPPORT
M: Joel Stanley <joel@jms.id.au>
-S: Maintained
+R: Andrew Jeffery <andrew@aj.id.au>
+L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
+L: linux-aspeed@lists.ozlabs.org (moderated for non-subscribers)
+Q: https://patchwork.ozlabs.org/project/linux-aspeed/list/
+S: Supported
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/joel/aspeed.git
F: arch/arm/mach-aspeed/
F: arch/arm/boot/dts/aspeed-*
-F: drivers/*/*aspeed*
+N: aspeed
ARM/ATMEL AT91 Clock Support
M: Boris Brezillon <boris.brezillon@bootlin.com>
ARM/OXNAS platform support
M: Neil Armstrong <narmstrong@baylibre.com>
L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
-L: linux-oxnas@lists.tuxfamily.org (moderated for non-subscribers)
+L: linux-oxnas@groups.io (moderated for non-subscribers)
S: Maintained
F: arch/arm/mach-oxnas/
F: arch/arm/boot/dts/ox8*.dts*
F: drivers/staging/fsl-dpaa2/ethsw
DPT_I2O SCSI RAID DRIVER
-M: Adaptec OEM Raid Solutions <aacraid@adaptec.com>
+M: Adaptec OEM Raid Solutions <aacraid@microsemi.com>
L: linux-scsi@vger.kernel.org
W: http://www.adaptec.com/
S: Maintained
S: Maintained
F: mm/hmm*
F: include/linux/hmm*
+F: Documentation/vm/hmm.txt
HOST AP DRIVER
M: Jouni Malinen <j@w1.fi>
F: include/uapi/linux/ipmi*
IPS SCSI RAID DRIVER
-M: Adaptec OEM Raid Solutions <aacraid@adaptec.com>
+M: Adaptec OEM Raid Solutions <aacraid@microsemi.com>
L: linux-scsi@vger.kernel.org
W: http://www.adaptec.com/
S: Maintained
RAPIDIO SUBSYSTEM
M: Matt Porter <mporter@kernel.crashing.org>
-M: Alexandre Bounine <alexandre.bounine@idt.com>
+M: Alexandre Bounine <alex.bou9@gmail.com>
S: Maintained
F: drivers/rapidio/
#define MAP_NONBLOCK 0x40000 /* do not block on IO */
#define MAP_STACK 0x80000 /* give out an address that is best suited for process/thread stacks */
#define MAP_HUGETLB 0x100000 /* create a huge page mapping */
+#define MAP_FIXED_NOREPLACE 0x200000/* MAP_FIXED which doesn't unmap underlying mapping */
#define MS_ASYNC 1 /* sync memory asynchronously */
#define MS_SYNC 2 /* synchronous memory sync */
#define ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE
extern void flush_kernel_dcache_page(struct page *);
-#define flush_dcache_mmap_lock(mapping) \
- spin_lock_irq(&(mapping)->tree_lock)
-#define flush_dcache_mmap_unlock(mapping) \
- spin_unlock_irq(&(mapping)->tree_lock)
+#define flush_dcache_mmap_lock(mapping) xa_lock_irq(&mapping->i_pages)
+#define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages)
#define flush_icache_user_range(vma,page,addr,len) \
flush_dcache_page(page)
#include <mach/memory.h>
#endif
-/*
- * Allow for constants defined here to be used from assembly code
- * by prepending the UL suffix only with actual C code compilation.
- */
-#define UL(x) _AC(x, UL)
-
/* PAGE_OFFSET - the virtual address of the start of the kernel image */
#define PAGE_OFFSET UL(CONFIG_PAGE_OFFSET)
DT_MACHINE_START(NPCM7XX_DT, "NPCM7XX Chip family")
.atag_offset = 0x100,
.dt_compat = npcm7xx_dt_match,
+ .l2c_aux_val = 0x0,
+ .l2c_aux_mask = ~0x0,
MACHINE_END
void __init dma_contiguous_remap(void)
{
int i;
+
+ if (!dma_mmu_remap_num)
+ return;
+
+ /* call flush_cache_all() since CMA area would be large enough */
+ flush_cache_all();
for (i = 0; i < dma_mmu_remap_num; i++) {
phys_addr_t start = dma_mmu_remap[i].base;
phys_addr_t end = start + dma_mmu_remap[i].size;
flush_tlb_kernel_range(__phys_to_virt(start),
__phys_to_virt(end));
- iotable_init(&map, 1);
+ /*
+ * All the memory in CMA region will be on ZONE_MOVABLE.
+ * If that zone is considered as highmem, the memory in CMA
+ * region is also considered as highmem even if it's
+ * physical address belong to lowmem. In this case,
+ * re-mapping isn't required.
+ */
+ if (!is_highmem_idx(ZONE_MOVABLE))
+ iotable_init(&map, 1);
}
}
#define MIN_GAP (128*1024*1024UL)
#define MAX_GAP ((TASK_SIZE)/6*5)
-static int mmap_is_legacy(void)
+static int mmap_is_legacy(struct rlimit *rlim_stack)
{
if (current->personality & ADDR_COMPAT_LAYOUT)
return 1;
- if (rlimit(RLIMIT_STACK) == RLIM_INFINITY)
+ if (rlim_stack->rlim_cur == RLIM_INFINITY)
return 1;
return sysctl_legacy_va_layout;
}
-static unsigned long mmap_base(unsigned long rnd)
+static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
{
- unsigned long gap = rlimit(RLIMIT_STACK);
+ unsigned long gap = rlim_stack->rlim_cur;
if (gap < MIN_GAP)
gap = MIN_GAP;
return rnd << PAGE_SHIFT;
}
-void arch_pick_mmap_layout(struct mm_struct *mm)
+void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
unsigned long random_factor = 0UL;
if (current->flags & PF_RANDOMIZE)
random_factor = arch_mmap_rnd();
- if (mmap_is_legacy()) {
+ if (mmap_is_legacy(rlim_stack)) {
mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
mm->get_unmapped_area = arch_get_unmapped_area;
} else {
- mm->mmap_base = mmap_base(random_factor);
+ mm->mmap_base = mmap_base(random_factor, rlim_stack);
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
}
}
dsb(ish);
}
-#define flush_dcache_mmap_lock(mapping) \
- spin_lock_irq(&(mapping)->tree_lock)
-#define flush_dcache_mmap_unlock(mapping) \
- spin_unlock_irq(&(mapping)->tree_lock)
+#define flush_dcache_mmap_lock(mapping) do { } while (0)
+#define flush_dcache_mmap_unlock(mapping) do { } while (0)
/*
* We don't appear to need to do anything here. In fact, if we did, we'd
#include <asm/page-def.h>
#include <asm/sizes.h>
-/*
- * Allow for constants defined here to be used from assembly code
- * by prepending the UL suffix only with actual C code compilation.
- */
-#define UL(x) _AC(x, UL)
-
/*
* Size of the PCI I/O space. This must remain a power of two so that
* IO_SPACE_LIMIT acts as a mask for the low bits of I/O addresses.
#define MIN_GAP (SZ_128M)
#define MAX_GAP (STACK_TOP/6*5)
-static int mmap_is_legacy(void)
+static int mmap_is_legacy(struct rlimit *rlim_stack)
{
if (current->personality & ADDR_COMPAT_LAYOUT)
return 1;
- if (rlimit(RLIMIT_STACK) == RLIM_INFINITY)
+ if (rlim_stack->rlim_cur == RLIM_INFINITY)
return 1;
return sysctl_legacy_va_layout;
return rnd << PAGE_SHIFT;
}
-static unsigned long mmap_base(unsigned long rnd)
+static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
{
- unsigned long gap = rlimit(RLIMIT_STACK);
+ unsigned long gap = rlim_stack->rlim_cur;
unsigned long pad = (STACK_RND_MASK << PAGE_SHIFT) + stack_guard_gap;
/* Values close to RLIM_INFINITY can overflow. */
* This function, called very early during the creation of a new process VM
* image, sets up which VM layout function to use:
*/
-void arch_pick_mmap_layout(struct mm_struct *mm)
+void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
unsigned long random_factor = 0UL;
* Fall back to the standard layout if the personality bit is set, or
* if the expected stack growth is unlimited:
*/
- if (mmap_is_legacy()) {
+ if (mmap_is_legacy(rlim_stack)) {
mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
mm->get_unmapped_area = arch_get_unmapped_area;
} else {
- mm->mmap_base = mmap_base(random_factor);
+ mm->mmap_base = mmap_base(random_factor, rlim_stack);
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
}
}
struct vm_area_struct;
-/* Tell drivers/pci/proc.c that we have pci_mmap_page_range() */
-#define HAVE_PCI_MMAP 1
-#define arch_can_pci_mmap_io() 1
+/* Tell PCI code what kind of PCI resource mappings we support */
+#define HAVE_PCI_MMAP 1
+#define ARCH_GENERIC_PCI_MMAP_RESOURCE 1
+#define arch_can_pci_mmap_io() 1
extern int pci_legacy_read(struct pci_bus *bus, loff_t port, u32 *val,
size_t count);
#define PAGE_KERNEL __pgprot(0) /* these mean nothing to non MMU */
#define pgprot_noncached(x) (x)
+#define pgprot_writecombine pgprot_noncached
+#define pgprot_device pgprot_noncached
#define __swp_type(x) (0)
#define __swp_offset(x) (0)
}
/*
- * Platform support for /proc/bus/pci/X/Y mmap()s,
- * modelled on the sparc64 implementation by Dave Miller.
- * -- paulus.
+ * Platform support for /proc/bus/pci/X/Y mmap()s.
*/
-/*
- * Adjust vm_pgoff of VMA such that it is the physical page offset
- * corresponding to the 32-bit pci bus offset for DEV requested by the user.
- *
- * Basically, the user finds the base address for his device which he wishes
- * to mmap. They read the 32-bit value from the config space base register,
- * add whatever PAGE_SIZE multiple offset they wish, and feed this into the
- * offset parameter of mmap on /proc/bus/pci/XXX for that device.
- *
- * Returns negative error code on failure, zero on success.
- */
-static struct resource *__pci_mmap_make_offset(struct pci_dev *dev,
- resource_size_t *offset,
- enum pci_mmap_state mmap_state)
+int pci_iobar_pfn(struct pci_dev *pdev, int bar, struct vm_area_struct *vma)
{
- struct pci_controller *hose = pci_bus_to_host(dev->bus);
- unsigned long io_offset = 0;
- int i, res_bit;
+ struct pci_controller *hose = pci_bus_to_host(pdev->bus);
+ resource_size_t ioaddr = pci_resource_start(pdev, bar);
if (!hose)
- return NULL; /* should never happen */
-
- /* If memory, add on the PCI bridge address offset */
- if (mmap_state == pci_mmap_mem) {
-#if 0 /* See comment in pci_resource_to_user() for why this is disabled */
- *offset += hose->pci_mem_offset;
-#endif
- res_bit = IORESOURCE_MEM;
- } else {
- io_offset = (unsigned long)hose->io_base_virt - _IO_BASE;
- *offset += io_offset;
- res_bit = IORESOURCE_IO;
- }
-
- /*
- * Check that the offset requested corresponds to one of the
- * resources of the device.
- */
- for (i = 0; i <= PCI_ROM_RESOURCE; i++) {
- struct resource *rp = &dev->resource[i];
- int flags = rp->flags;
+ return -EINVAL; /* should never happen */
- /* treat ROM as memory (should be already) */
- if (i == PCI_ROM_RESOURCE)
- flags |= IORESOURCE_MEM;
-
- /* Active and same type? */
- if ((flags & res_bit) == 0)
- continue;
-
- /* In the range of this resource? */
- if (*offset < (rp->start & PAGE_MASK) || *offset > rp->end)
- continue;
-
- /* found it! construct the final physical address */
- if (mmap_state == pci_mmap_io)
- *offset += hose->io_base_phys - io_offset;
- return rp;
- }
+ /* Convert to an offset within this PCI controller */
+ ioaddr -= (unsigned long)hose->io_base_virt - _IO_BASE;
- return NULL;
+ vma->vm_pgoff += (ioaddr + hose->io_base_phys) >> PAGE_SHIFT;
+ return 0;
}
/*
return prot;
}
-/*
- * Perform the actual remap of the pages for a PCI device mapping, as
- * appropriate for this architecture. The region in the process to map
- * is described by vm_start and vm_end members of VMA, the base physical
- * address is found in vm_pgoff.
- * The pci device structure is provided so that architectures may make mapping
- * decisions on a per-device or per-bus basis.
- *
- * Returns a negative error code on failure, zero on success.
- */
-int pci_mmap_page_range(struct pci_dev *dev, int bar, struct vm_area_struct *vma,
- enum pci_mmap_state mmap_state, int write_combine)
-{
- resource_size_t offset =
- ((resource_size_t)vma->vm_pgoff) << PAGE_SHIFT;
- struct resource *rp;
- int ret;
-
- rp = __pci_mmap_make_offset(dev, &offset, mmap_state);
- if (rp == NULL)
- return -EINVAL;
-
- vma->vm_pgoff = offset >> PAGE_SHIFT;
- vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
-
- ret = remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
- vma->vm_end - vma->vm_start, vma->vm_page_prot);
-
- return ret;
-}
-
/* This provides legacy IO read access on a bus */
int pci_legacy_read(struct pci_bus *bus, loff_t port, u32 *val, size_t size)
{
#define MAP_NONBLOCK 0x20000 /* do not block on IO */
#define MAP_STACK 0x40000 /* give out an address that is best suited for process/thread stacks */
#define MAP_HUGETLB 0x80000 /* create a huge page mapping */
+#define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */
/*
* Flags for msync
#define MIN_GAP (128*1024*1024UL)
#define MAX_GAP ((TASK_SIZE)/6*5)
-static int mmap_is_legacy(void)
+static int mmap_is_legacy(struct rlimit *rlim_stack)
{
if (current->personality & ADDR_COMPAT_LAYOUT)
return 1;
- if (rlimit(RLIMIT_STACK) == RLIM_INFINITY)
+ if (rlim_stack->rlim_cur == RLIM_INFINITY)
return 1;
return sysctl_legacy_va_layout;
}
-static unsigned long mmap_base(unsigned long rnd)
+static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
{
- unsigned long gap = rlimit(RLIMIT_STACK);
+ unsigned long gap = rlim_stack->rlim_cur;
if (gap < MIN_GAP)
gap = MIN_GAP;
return rnd << PAGE_SHIFT;
}
-void arch_pick_mmap_layout(struct mm_struct *mm)
+void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
unsigned long random_factor = 0UL;
if (current->flags & PF_RANDOMIZE)
random_factor = arch_mmap_rnd();
- if (mmap_is_legacy()) {
+ if (mmap_is_legacy(rlim_stack)) {
mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
mm->get_unmapped_area = arch_get_unmapped_area;
} else {
- mm->mmap_base = mmap_base(random_factor);
+ mm->mmap_base = mmap_base(random_factor, rlim_stack);
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
}
}
void flush_kernel_dcache_page(struct page *page);
void flush_icache_range(unsigned long start, unsigned long end);
void flush_icache_page(struct vm_area_struct *vma, struct page *page);
-#define flush_dcache_mmap_lock(mapping) spin_lock_irq(&(mapping)->tree_lock)
-#define flush_dcache_mmap_unlock(mapping) spin_unlock_irq(&(mapping)->tree_lock)
+#define flush_dcache_mmap_lock(mapping) xa_lock_irq(&(mapping)->i_pages)
+#define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&(mapping)->i_pages)
#else
#include <asm-generic/cacheflush.h>
extern void flush_dcache_range(unsigned long start, unsigned long end);
extern void invalidate_dcache_range(unsigned long start, unsigned long end);
-#define flush_dcache_mmap_lock(mapping) \
- spin_lock_irq(&(mapping)->tree_lock)
-#define flush_dcache_mmap_unlock(mapping) \
- spin_unlock_irq(&(mapping)->tree_lock)
+#define flush_dcache_mmap_lock(mapping) xa_lock_irq(&mapping->i_pages)
+#define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages)
#endif /* _ASM_NIOS2_CACHEFLUSH_H */
return ret;
}
-void read_persistent_clock(struct timespec *ts)
+void read_persistent_clock64(struct timespec64 *ts)
{
- ts->tv_sec = mktime(2007, 1, 1, 0, 0, 0);
+ ts->tv_sec = mktime64(2007, 1, 1, 0, 0, 0);
ts->tv_nsec = 0;
}
#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
extern void flush_dcache_page(struct page *page);
-#define flush_dcache_mmap_lock(mapping) \
- spin_lock_irq(&(mapping)->tree_lock)
-#define flush_dcache_mmap_unlock(mapping) \
- spin_unlock_irq(&(mapping)->tree_lock)
+#define flush_dcache_mmap_lock(mapping) xa_lock_irq(&mapping->i_pages)
+#define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages)
#define flush_icache_page(vma,page) do { \
flush_kernel_dcache_page(page); \
#define MAP_NONBLOCK 0x20000 /* do not block on IO */
#define MAP_STACK 0x40000 /* give out an address that is best suited for process/thread stacks */
#define MAP_HUGETLB 0x80000 /* create a huge page mapping */
+#define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */
#define MS_SYNC 1 /* synchronous memory sync */
#define MS_ASYNC 2 /* sync memory asynchronously */
* Top of mmap area (just below the process stack).
*/
-static unsigned long mmap_upper_limit(void)
+/*
+ * When called from arch_get_unmapped_area(), rlim_stack will be NULL,
+ * indicating that "current" should be used instead of a passed-in
+ * value from the exec bprm as done with arch_pick_mmap_layout().
+ */
+static unsigned long mmap_upper_limit(struct rlimit *rlim_stack)
{
unsigned long stack_base;
/* Limit stack size - see setup_arg_pages() in fs/exec.c */
- stack_base = rlimit_max(RLIMIT_STACK);
+ stack_base = rlim_stack ? rlim_stack->rlim_max
+ : rlimit_max(RLIMIT_STACK);
if (stack_base > STACK_SIZE_MAX)
stack_base = STACK_SIZE_MAX;
info.flags = 0;
info.length = len;
info.low_limit = mm->mmap_legacy_base;
- info.high_limit = mmap_upper_limit();
+ info.high_limit = mmap_upper_limit(NULL);
info.align_mask = last_mmap ? (PAGE_MASK & (SHM_COLOUR - 1)) : 0;
info.align_offset = shared_align_offset(last_mmap, pgoff);
addr = vm_unmapped_area(&info);
* This function, called very early during the creation of a new
* process VM image, sets up which VM layout function to use:
*/
-void arch_pick_mmap_layout(struct mm_struct *mm)
+void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
mm->mmap_legacy_base = mmap_legacy_base();
- mm->mmap_base = mmap_upper_limit();
+ mm->mmap_base = mmap_upper_limit(rlim_stack);
if (mmap_is_legacy()) {
mm->mmap_base = mm->mmap_legacy_base;
#define MIN_GAP (128*1024*1024)
#define MAX_GAP (TASK_SIZE/6*5)
-static inline int mmap_is_legacy(void)
+static inline int mmap_is_legacy(struct rlimit *rlim_stack)
{
if (current->personality & ADDR_COMPAT_LAYOUT)
return 1;
- if (rlimit(RLIMIT_STACK) == RLIM_INFINITY)
+ if (rlim_stack->rlim_cur == RLIM_INFINITY)
return 1;
return sysctl_legacy_va_layout;
return (1<<30);
}
-static inline unsigned long mmap_base(unsigned long rnd)
+static inline unsigned long mmap_base(unsigned long rnd,
+ struct rlimit *rlim_stack)
{
- unsigned long gap = rlimit(RLIMIT_STACK);
+ unsigned long gap = rlim_stack->rlim_cur;
unsigned long pad = stack_maxrandom_size() + stack_guard_gap;
/* Values close to RLIM_INFINITY can overflow. */
}
static void radix__arch_pick_mmap_layout(struct mm_struct *mm,
- unsigned long random_factor)
+ unsigned long random_factor,
+ struct rlimit *rlim_stack)
{
- if (mmap_is_legacy()) {
+ if (mmap_is_legacy(rlim_stack)) {
mm->mmap_base = TASK_UNMAPPED_BASE;
mm->get_unmapped_area = radix__arch_get_unmapped_area;
} else {
- mm->mmap_base = mmap_base(random_factor);
+ mm->mmap_base = mmap_base(random_factor, rlim_stack);
mm->get_unmapped_area = radix__arch_get_unmapped_area_topdown;
}
}
#else
/* dummy */
extern void radix__arch_pick_mmap_layout(struct mm_struct *mm,
- unsigned long random_factor);
+ unsigned long random_factor,
+ struct rlimit *rlim_stack);
#endif
/*
* This function, called very early during the creation of a new
* process VM image, sets up which VM layout function to use:
*/
-void arch_pick_mmap_layout(struct mm_struct *mm)
+void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
unsigned long random_factor = 0UL;
random_factor = arch_mmap_rnd();
if (radix_enabled())
- return radix__arch_pick_mmap_layout(mm, random_factor);
+ return radix__arch_pick_mmap_layout(mm, random_factor,
+ rlim_stack);
/*
* Fall back to the standard layout if the personality
* bit is set, or if the expected stack growth is unlimited:
*/
- if (mmap_is_legacy()) {
+ if (mmap_is_legacy(rlim_stack)) {
mm->mmap_base = TASK_UNMAPPED_BASE;
mm->get_unmapped_area = arch_get_unmapped_area;
} else {
- mm->mmap_base = mmap_base(random_factor);
+ mm->mmap_base = mmap_base(random_factor, rlim_stack);
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
}
}
/*
* Taken from alloc_migrate_target with changes to remove CMA allocations
*/
-struct page *new_iommu_non_cma_page(struct page *page, unsigned long private,
- int **resultp)
+struct page *new_iommu_non_cma_page(struct page *page, unsigned long private)
{
gfp_t gfp_mask = GFP_USER;
struct page *new_page;
#define MIN_GAP (32*1024*1024)
#define MAX_GAP (STACK_TOP/6*5)
-static inline int mmap_is_legacy(void)
+static inline int mmap_is_legacy(struct rlimit *rlim_stack)
{
if (current->personality & ADDR_COMPAT_LAYOUT)
return 1;
- if (rlimit(RLIMIT_STACK) == RLIM_INFINITY)
+ if (rlim_stack->rlim_cur == RLIM_INFINITY)
return 1;
return sysctl_legacy_va_layout;
}
return TASK_UNMAPPED_BASE + rnd;
}
-static inline unsigned long mmap_base(unsigned long rnd)
+static inline unsigned long mmap_base(unsigned long rnd,
+ struct rlimit *rlim_stack)
{
- unsigned long gap = rlimit(RLIMIT_STACK);
+ unsigned long gap = rlim_stack->rlim_cur;
if (gap < MIN_GAP)
gap = MIN_GAP;
* This function, called very early during the creation of a new
* process VM image, sets up which VM layout function to use:
*/
-void arch_pick_mmap_layout(struct mm_struct *mm)
+void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
unsigned long random_factor = 0UL;
* Fall back to the standard layout if the personality
* bit is set, or if the expected stack growth is unlimited:
*/
- if (mmap_is_legacy()) {
+ if (mmap_is_legacy(rlim_stack)) {
mm->mmap_base = mmap_base_legacy(random_factor);
mm->get_unmapped_area = arch_get_unmapped_area;
} else {
- mm->mmap_base = mmap_base(random_factor);
+ mm->mmap_base = mmap_base(random_factor, rlim_stack);
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
}
}
return rnd << PAGE_SHIFT;
}
-void arch_pick_mmap_layout(struct mm_struct *mm)
+void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
unsigned long random_factor = mmap_rnd();
unsigned long gap;
* Fall back to the standard layout if the personality
* bit is set, or if the expected stack growth is unlimited:
*/
- gap = rlimit(RLIMIT_STACK);
+ gap = rlim_stack->rlim_cur;
if (!test_thread_flag(TIF_32BIT) ||
(current->personality & ADDR_COMPAT_LAYOUT) ||
gap == RLIM_INFINITY ||
more than one without conflict. If you don't need UML networking,
say N.
+config UML_NET_VECTOR
+ bool "Vector I/O high performance network devices"
+ depends on UML_NET
+ help
+ This User-Mode Linux network driver uses multi-message send
+ and receive functions. The host running the UML guest must have
+ a linux kernel version above 3.0 and a libc version > 2.13.
+ This driver provides tap, raw, gre and l2tpv3 network transports
+ with up to 4 times higher network throughput than the UML network
+ drivers.
+
config UML_NET_VDE
bool "VDE transport"
depends on UML_NET
slip-objs := slip_kern.o slip_user.o
slirp-objs := slirp_kern.o slirp_user.o
daemon-objs := daemon_kern.o daemon_user.o
+vector-objs := vector_kern.o vector_user.o vector_transports.o
umcast-objs := umcast_kern.o umcast_user.o
net-objs := net_kern.o net_user.o
mconsole-objs := mconsole_kern.o mconsole_user.o
obj-$(CONFIG_UML_NET_SLIP) += slip.o slip_common.o
obj-$(CONFIG_UML_NET_SLIRP) += slirp.o slip_common.o
obj-$(CONFIG_UML_NET_DAEMON) += daemon.o
+obj-$(CONFIG_UML_NET_VECTOR) += vector.o
obj-$(CONFIG_UML_NET_VDE) += vde.o
obj-$(CONFIG_UML_NET_MCAST) += umcast.o
obj-$(CONFIG_UML_NET_PCAP) += pcap.o
obj-$(CONFIG_UML_RANDOM) += random.o
# pcap_user.o must be added explicitly.
-USER_OBJS := fd.o null.o pty.o tty.o xterm.o slip_common.o pcap_user.o vde_user.o
+USER_OBJS := fd.o null.o pty.o tty.o xterm.o slip_common.o pcap_user.o vde_user.o vector_user.o
CFLAGS_null.o = -DDEV_NULL=$(DEV_NULL_PATH)
include arch/um/scripts/Makefile.rules
return err;
}
-/* Items are added in IRQ context, when free_irq can't be called, and
- * removed in process context, when it can.
- * This handles interrupt sources which disappear, and which need to
- * be permanently disabled. This is discovered in IRQ context, but
- * the freeing of the IRQ must be done later.
- */
-static DEFINE_SPINLOCK(irqs_to_free_lock);
-static LIST_HEAD(irqs_to_free);
-
-void free_irqs(void)
-{
- struct chan *chan;
- LIST_HEAD(list);
- struct list_head *ele;
- unsigned long flags;
-
- spin_lock_irqsave(&irqs_to_free_lock, flags);
- list_splice_init(&irqs_to_free, &list);
- spin_unlock_irqrestore(&irqs_to_free_lock, flags);
-
- list_for_each(ele, &list) {
- chan = list_entry(ele, struct chan, free_list);
-
- if (chan->input && chan->enabled)
- um_free_irq(chan->line->driver->read_irq, chan);
- if (chan->output && chan->enabled)
- um_free_irq(chan->line->driver->write_irq, chan);
- chan->enabled = 0;
- }
-}
-
static void close_one_chan(struct chan *chan, int delay_free_irq)
{
- unsigned long flags;
-
if (!chan->opened)
return;
- if (delay_free_irq) {
- spin_lock_irqsave(&irqs_to_free_lock, flags);
- list_add(&chan->free_list, &irqs_to_free);
- spin_unlock_irqrestore(&irqs_to_free_lock, flags);
- }
- else {
- if (chan->input && chan->enabled)
- um_free_irq(chan->line->driver->read_irq, chan);
- if (chan->output && chan->enabled)
- um_free_irq(chan->line->driver->write_irq, chan);
- chan->enabled = 0;
- }
+ /* we can safely call free now - it will be marked
+ * as free and freed once the IRQ stopped processing
+ */
+ if (chan->input && chan->enabled)
+ um_free_irq(chan->line->driver->read_irq, chan);
+ if (chan->output && chan->enabled)
+ um_free_irq(chan->line->driver->write_irq, chan);
+ chan->enabled = 0;
if (chan->ops->close != NULL)
(*chan->ops->close)(chan->fd, chan->data);
if (err)
return err;
if (output)
- err = um_request_irq(driver->write_irq, fd, IRQ_WRITE,
+ err = um_request_irq(driver->write_irq, fd, IRQ_NONE,
line_write_interrupt, IRQF_SHARED,
driver->write_irq_name, data);
return err;
#endif
}
-static void setup_etheraddr(struct net_device *dev, char *str)
+void uml_net_setup_etheraddr(struct net_device *dev, char *str)
{
unsigned char *addr = dev->dev_addr;
char *end;
*/
snprintf(dev->name, sizeof(dev->name), "eth%d", n);
- setup_etheraddr(dev, mac);
+ uml_net_setup_etheraddr(dev, mac);
printk(KERN_INFO "Netdevice %d (%pM) : ", n, dev->dev_addr);
#include <linux/miscdevice.h>
#include <linux/delay.h>
#include <linux/uaccess.h>
+#include <init.h>
#include <irq_kern.h>
#include <os.h>
/*
* rng_cleanup - shutdown RNG module
*/
-static void __exit rng_cleanup (void)
+
+static void cleanup(void)
+{
+ free_irq_by_fd(random_fd);
+ os_close_file(random_fd);
+}
+
+static void __exit rng_cleanup(void)
{
os_close_file(random_fd);
misc_deregister (&rng_miscdev);
module_init (rng_init);
module_exit (rng_cleanup);
+__uml_exitcall(cleanup);
MODULE_DESCRIPTION("UML Host Random Number Generator (RNG) driver");
MODULE_LICENSE("GPL");
do {
res = os_write_file(kernel_fd, ((char *) io_req_buffer) + written, n);
- if (res > 0) {
+ if (res >= 0) {
written += res;
} else {
if (res != -EAGAIN) {
- printk("io_thread - read failed, fd = %d, "
+ printk("io_thread - write failed, fd = %d, "
"err = %d\n", kernel_fd, -n);
}
}
--- /dev/null
+/*
+ * Copyright (C) 2017 - Cambridge Greys Limited
+ * Copyright (C) 2011 - 2014 Cisco Systems Inc
+ * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+ * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and
+ * James Leu (jleu@mindspring.net).
+ * Copyright (C) 2001 by various other people who didn't put their name here.
+ * Licensed under the GPL.
+ */
+
+#include <linux/version.h>
+#include <linux/bootmem.h>
+#include <linux/etherdevice.h>
+#include <linux/ethtool.h>
+#include <linux/inetdevice.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/platform_device.h>
+#include <linux/rtnetlink.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <init.h>
+#include <irq_kern.h>
+#include <irq_user.h>
+#include <net_kern.h>
+#include <os.h>
+#include "mconsole_kern.h"
+#include "vector_user.h"
+#include "vector_kern.h"
+
+/*
+ * Adapted from network devices with the following major changes:
+ * All transports are static - simplifies the code significantly
+ * Multiple FDs/IRQs per device
+ * Vector IO optionally used for read/write, falling back to legacy
+ * based on configuration and/or availability
+ * Configuration is no longer positional - L2TPv3 and GRE require up to
+ * 10 parameters, passing this as positional is not fit for purpose.
+ * Only socket transports are supported
+ */
+
+
+#define DRIVER_NAME "uml-vector"
+#define DRIVER_VERSION "01"
+struct vector_cmd_line_arg {
+ struct list_head list;
+ int unit;
+ char *arguments;
+};
+
+struct vector_device {
+ struct list_head list;
+ struct net_device *dev;
+ struct platform_device pdev;
+ int unit;
+ int opened;
+};
+
+static LIST_HEAD(vec_cmd_line);
+
+static DEFINE_SPINLOCK(vector_devices_lock);
+static LIST_HEAD(vector_devices);
+
+static int driver_registered;
+
+static void vector_eth_configure(int n, struct arglist *def);
+
+/* Argument accessors to set variables (and/or set default values)
+ * mtu, buffer sizing, default headroom, etc
+ */
+
+#define DEFAULT_HEADROOM 2
+#define SAFETY_MARGIN 32
+#define DEFAULT_VECTOR_SIZE 64
+#define TX_SMALL_PACKET 128
+#define MAX_IOV_SIZE (MAX_SKB_FRAGS + 1)
+
+static const struct {
+ const char string[ETH_GSTRING_LEN];
+} ethtool_stats_keys[] = {
+ { "rx_queue_max" },
+ { "rx_queue_running_average" },
+ { "tx_queue_max" },
+ { "tx_queue_running_average" },
+ { "rx_encaps_errors" },
+ { "tx_timeout_count" },
+ { "tx_restart_queue" },
+ { "tx_kicks" },
+ { "tx_flow_control_xon" },
+ { "tx_flow_control_xoff" },
+ { "rx_csum_offload_good" },
+ { "rx_csum_offload_errors"},
+ { "sg_ok"},
+ { "sg_linearized"},
+};
+
+#define VECTOR_NUM_STATS ARRAY_SIZE(ethtool_stats_keys)
+
+static void vector_reset_stats(struct vector_private *vp)
+{
+ vp->estats.rx_queue_max = 0;
+ vp->estats.rx_queue_running_average = 0;
+ vp->estats.tx_queue_max = 0;
+ vp->estats.tx_queue_running_average = 0;
+ vp->estats.rx_encaps_errors = 0;
+ vp->estats.tx_timeout_count = 0;
+ vp->estats.tx_restart_queue = 0;
+ vp->estats.tx_kicks = 0;
+ vp->estats.tx_flow_control_xon = 0;
+ vp->estats.tx_flow_control_xoff = 0;
+ vp->estats.sg_ok = 0;
+ vp->estats.sg_linearized = 0;
+}
+
+static int get_mtu(struct arglist *def)
+{
+ char *mtu = uml_vector_fetch_arg(def, "mtu");
+ long result;
+
+ if (mtu != NULL) {
+ if (kstrtoul(mtu, 10, &result) == 0)
+ return result;
+ }
+ return ETH_MAX_PACKET;
+}
+
+static int get_depth(struct arglist *def)
+{
+ char *mtu = uml_vector_fetch_arg(def, "depth");
+ long result;
+
+ if (mtu != NULL) {
+ if (kstrtoul(mtu, 10, &result) == 0)
+ return result;
+ }
+ return DEFAULT_VECTOR_SIZE;
+}
+
+static int get_headroom(struct arglist *def)
+{
+ char *mtu = uml_vector_fetch_arg(def, "headroom");
+ long result;
+
+ if (mtu != NULL) {
+ if (kstrtoul(mtu, 10, &result) == 0)
+ return result;
+ }
+ return DEFAULT_HEADROOM;
+}
+
+static int get_req_size(struct arglist *def)
+{
+ char *gro = uml_vector_fetch_arg(def, "gro");
+ long result;
+
+ if (gro != NULL) {
+ if (kstrtoul(gro, 10, &result) == 0) {
+ if (result > 0)
+ return 65536;
+ }
+ }
+ return get_mtu(def) + ETH_HEADER_OTHER +
+ get_headroom(def) + SAFETY_MARGIN;
+}
+
+
+static int get_transport_options(struct arglist *def)
+{
+ char *transport = uml_vector_fetch_arg(def, "transport");
+ char *vector = uml_vector_fetch_arg(def, "vec");
+
+ int vec_rx = VECTOR_RX;
+ int vec_tx = VECTOR_TX;
+ long parsed;
+
+ if (vector != NULL) {
+ if (kstrtoul(vector, 10, &parsed) == 0) {
+ if (parsed == 0) {
+ vec_rx = 0;
+ vec_tx = 0;
+ }
+ }
+ }
+
+
+ if (strncmp(transport, TRANS_TAP, TRANS_TAP_LEN) == 0)
+ return (vec_rx | VECTOR_BPF);
+ if (strncmp(transport, TRANS_RAW, TRANS_RAW_LEN) == 0)
+ return (vec_rx | vec_tx);
+ return (vec_rx | vec_tx);
+}
+
+
+/* A mini-buffer for packet drop read
+ * All of our supported transports are datagram oriented and we always
+ * read using recvmsg or recvmmsg. If we pass a buffer which is smaller
+ * than the packet size it still counts as full packet read and will
+ * clean the incoming stream to keep sigio/epoll happy
+ */
+
+#define DROP_BUFFER_SIZE 32
+
+static char *drop_buffer;
+
+/* Array backed queues optimized for bulk enqueue/dequeue and
+ * 1:N (small values of N) or 1:1 enqueuer/dequeuer ratios.
+ * For more details and full design rationale see
+ * http://foswiki.cambridgegreys.com/Main/EatYourTailAndEnjoyIt
+ */
+
+
+/*
+ * Advance the mmsg queue head by n = advance. Resets the queue to
+ * maximum enqueue/dequeue-at-once capacity if possible. Called by
+ * dequeuers. Caller must hold the head_lock!
+ */
+
+static int vector_advancehead(struct vector_queue *qi, int advance)
+{
+ int queue_depth;
+
+ qi->head =
+ (qi->head + advance)
+ % qi->max_depth;
+
+
+ spin_lock(&qi->tail_lock);
+ qi->queue_depth -= advance;
+
+ /* we are at 0, use this to
+ * reset head and tail so we can use max size vectors
+ */
+
+ if (qi->queue_depth == 0) {
+ qi->head = 0;
+ qi->tail = 0;
+ }
+ queue_depth = qi->queue_depth;
+ spin_unlock(&qi->tail_lock);
+ return queue_depth;
+}
+
+/* Advance the queue tail by n = advance.
+ * This is called by enqueuers which should hold the
+ * head lock already
+ */
+
+static int vector_advancetail(struct vector_queue *qi, int advance)
+{
+ int queue_depth;
+
+ qi->tail =
+ (qi->tail + advance)
+ % qi->max_depth;
+ spin_lock(&qi->head_lock);
+ qi->queue_depth += advance;
+ queue_depth = qi->queue_depth;
+ spin_unlock(&qi->head_lock);
+ return queue_depth;
+}
+
+static int prep_msg(struct vector_private *vp,
+ struct sk_buff *skb,
+ struct iovec *iov)
+{
+ int iov_index = 0;
+ int nr_frags, frag;
+ skb_frag_t *skb_frag;
+
+ nr_frags = skb_shinfo(skb)->nr_frags;
+ if (nr_frags > MAX_IOV_SIZE) {
+ if (skb_linearize(skb) != 0)
+ goto drop;
+ }
+ if (vp->header_size > 0) {
+ iov[iov_index].iov_len = vp->header_size;
+ vp->form_header(iov[iov_index].iov_base, skb, vp);
+ iov_index++;
+ }
+ iov[iov_index].iov_base = skb->data;
+ if (nr_frags > 0) {
+ iov[iov_index].iov_len = skb->len - skb->data_len;
+ vp->estats.sg_ok++;
+ } else
+ iov[iov_index].iov_len = skb->len;
+ iov_index++;
+ for (frag = 0; frag < nr_frags; frag++) {
+ skb_frag = &skb_shinfo(skb)->frags[frag];
+ iov[iov_index].iov_base = skb_frag_address_safe(skb_frag);
+ iov[iov_index].iov_len = skb_frag_size(skb_frag);
+ iov_index++;
+ }
+ return iov_index;
+drop:
+ return -1;
+}
+/*
+ * Generic vector enqueue with support for forming headers using transport
+ * specific callback. Allows GRE, L2TPv3, RAW and other transports
+ * to use a common enqueue procedure in vector mode
+ */
+
+static int vector_enqueue(struct vector_queue *qi, struct sk_buff *skb)
+{
+ struct vector_private *vp = netdev_priv(qi->dev);
+ int queue_depth;
+ int packet_len;
+ struct mmsghdr *mmsg_vector = qi->mmsg_vector;
+ int iov_count;
+
+ spin_lock(&qi->tail_lock);
+ spin_lock(&qi->head_lock);
+ queue_depth = qi->queue_depth;
+ spin_unlock(&qi->head_lock);
+
+ if (skb)
+ packet_len = skb->len;
+
+ if (queue_depth < qi->max_depth) {
+
+ *(qi->skbuff_vector + qi->tail) = skb;
+ mmsg_vector += qi->tail;
+ iov_count = prep_msg(
+ vp,
+ skb,
+ mmsg_vector->msg_hdr.msg_iov
+ );
+ if (iov_count < 1)
+ goto drop;
+ mmsg_vector->msg_hdr.msg_iovlen = iov_count;
+ mmsg_vector->msg_hdr.msg_name = vp->fds->remote_addr;
+ mmsg_vector->msg_hdr.msg_namelen = vp->fds->remote_addr_size;
+ queue_depth = vector_advancetail(qi, 1);
+ } else
+ goto drop;
+ spin_unlock(&qi->tail_lock);
+ return queue_depth;
+drop:
+ qi->dev->stats.tx_dropped++;
+ if (skb != NULL) {
+ packet_len = skb->len;
+ dev_consume_skb_any(skb);
+ netdev_completed_queue(qi->dev, 1, packet_len);
+ }
+ spin_unlock(&qi->tail_lock);
+ return queue_depth;
+}
+
+static int consume_vector_skbs(struct vector_queue *qi, int count)
+{
+ struct sk_buff *skb;
+ int skb_index;
+ int bytes_compl = 0;
+
+ for (skb_index = qi->head; skb_index < qi->head + count; skb_index++) {
+ skb = *(qi->skbuff_vector + skb_index);
+ /* mark as empty to ensure correct destruction if
+ * needed
+ */
+ bytes_compl += skb->len;
+ *(qi->skbuff_vector + skb_index) = NULL;
+ dev_consume_skb_any(skb);
+ }
+ qi->dev->stats.tx_bytes += bytes_compl;
+ qi->dev->stats.tx_packets += count;
+ netdev_completed_queue(qi->dev, count, bytes_compl);
+ return vector_advancehead(qi, count);
+}
+
+/*
+ * Generic vector deque via sendmmsg with support for forming headers
+ * using transport specific callback. Allows GRE, L2TPv3, RAW and
+ * other transports to use a common dequeue procedure in vector mode
+ */
+
+
+static int vector_send(struct vector_queue *qi)
+{
+ struct vector_private *vp = netdev_priv(qi->dev);
+ struct mmsghdr *send_from;
+ int result = 0, send_len, queue_depth = qi->max_depth;
+
+ if (spin_trylock(&qi->head_lock)) {
+ if (spin_trylock(&qi->tail_lock)) {
+ /* update queue_depth to current value */
+ queue_depth = qi->queue_depth;
+ spin_unlock(&qi->tail_lock);
+ while (queue_depth > 0) {
+ /* Calculate the start of the vector */
+ send_len = queue_depth;
+ send_from = qi->mmsg_vector;
+ send_from += qi->head;
+ /* Adjust vector size if wraparound */
+ if (send_len + qi->head > qi->max_depth)
+ send_len = qi->max_depth - qi->head;
+ /* Try to TX as many packets as possible */
+ if (send_len > 0) {
+ result = uml_vector_sendmmsg(
+ vp->fds->tx_fd,
+ send_from,
+ send_len,
+ 0
+ );
+ vp->in_write_poll =
+ (result != send_len);
+ }
+ /* For some of the sendmmsg error scenarios
+ * we may end being unsure in the TX success
+ * for all packets. It is safer to declare
+ * them all TX-ed and blame the network.
+ */
+ if (result < 0) {
+ if (net_ratelimit())
+ netdev_err(vp->dev, "sendmmsg err=%i\n",
+ result);
+ result = send_len;
+ }
+ if (result > 0) {
+ queue_depth =
+ consume_vector_skbs(qi, result);
+ /* This is equivalent to an TX IRQ.
+ * Restart the upper layers to feed us
+ * more packets.
+ */
+ if (result > vp->estats.tx_queue_max)
+ vp->estats.tx_queue_max = result;
+ vp->estats.tx_queue_running_average =
+ (vp->estats.tx_queue_running_average + result) >> 1;
+ }
+ netif_trans_update(qi->dev);
+ netif_wake_queue(qi->dev);
+ /* if TX is busy, break out of the send loop,
+ * poll write IRQ will reschedule xmit for us
+ */
+ if (result != send_len) {
+ vp->estats.tx_restart_queue++;
+ break;
+ }
+ }
+ }
+ spin_unlock(&qi->head_lock);
+ } else {
+ tasklet_schedule(&vp->tx_poll);
+ }
+ return queue_depth;
+}
+
+/* Queue destructor. Deliberately stateless so we can use
+ * it in queue cleanup if initialization fails.
+ */
+
+static void destroy_queue(struct vector_queue *qi)
+{
+ int i;
+ struct iovec *iov;
+ struct vector_private *vp = netdev_priv(qi->dev);
+ struct mmsghdr *mmsg_vector;
+
+ if (qi == NULL)
+ return;
+ /* deallocate any skbuffs - we rely on any unused to be
+ * set to NULL.
+ */
+ if (qi->skbuff_vector != NULL) {
+ for (i = 0; i < qi->max_depth; i++) {
+ if (*(qi->skbuff_vector + i) != NULL)
+ dev_kfree_skb_any(*(qi->skbuff_vector + i));
+ }
+ kfree(qi->skbuff_vector);
+ }
+ /* deallocate matching IOV structures including header buffs */
+ if (qi->mmsg_vector != NULL) {
+ mmsg_vector = qi->mmsg_vector;
+ for (i = 0; i < qi->max_depth; i++) {
+ iov = mmsg_vector->msg_hdr.msg_iov;
+ if (iov != NULL) {
+ if ((vp->header_size > 0) &&
+ (iov->iov_base != NULL))
+ kfree(iov->iov_base);
+ kfree(iov);
+ }
+ mmsg_vector++;
+ }
+ kfree(qi->mmsg_vector);
+ }
+ kfree(qi);
+}
+
+/*
+ * Queue constructor. Create a queue with a given side.
+ */
+static struct vector_queue *create_queue(
+ struct vector_private *vp,
+ int max_size,
+ int header_size,
+ int num_extra_frags)
+{
+ struct vector_queue *result;
+ int i;
+ struct iovec *iov;
+ struct mmsghdr *mmsg_vector;
+
+ result = kmalloc(sizeof(struct vector_queue), GFP_KERNEL);
+ if (result == NULL)
+ goto out_fail;
+ result->max_depth = max_size;
+ result->dev = vp->dev;
+ result->mmsg_vector = kmalloc(
+ (sizeof(struct mmsghdr) * max_size), GFP_KERNEL);
+ result->skbuff_vector = kmalloc(
+ (sizeof(void *) * max_size), GFP_KERNEL);
+ if (result->mmsg_vector == NULL || result->skbuff_vector == NULL)
+ goto out_fail;
+
+ mmsg_vector = result->mmsg_vector;
+ for (i = 0; i < max_size; i++) {
+ /* Clear all pointers - we use non-NULL as marking on
+ * what to free on destruction
+ */
+ *(result->skbuff_vector + i) = NULL;
+ mmsg_vector->msg_hdr.msg_iov = NULL;
+ mmsg_vector++;
+ }
+ mmsg_vector = result->mmsg_vector;
+ result->max_iov_frags = num_extra_frags;
+ for (i = 0; i < max_size; i++) {
+ if (vp->header_size > 0)
+ iov = kmalloc(
+ sizeof(struct iovec) * (3 + num_extra_frags),
+ GFP_KERNEL
+ );
+ else
+ iov = kmalloc(
+ sizeof(struct iovec) * (2 + num_extra_frags),
+ GFP_KERNEL
+ );
+ if (iov == NULL)
+ goto out_fail;
+ mmsg_vector->msg_hdr.msg_iov = iov;
+ mmsg_vector->msg_hdr.msg_iovlen = 1;
+ mmsg_vector->msg_hdr.msg_control = NULL;
+ mmsg_vector->msg_hdr.msg_controllen = 0;
+ mmsg_vector->msg_hdr.msg_flags = MSG_DONTWAIT;
+ mmsg_vector->msg_hdr.msg_name = NULL;
+ mmsg_vector->msg_hdr.msg_namelen = 0;
+ if (vp->header_size > 0) {
+ iov->iov_base = kmalloc(header_size, GFP_KERNEL);
+ if (iov->iov_base == NULL)
+ goto out_fail;
+ iov->iov_len = header_size;
+ mmsg_vector->msg_hdr.msg_iovlen = 2;
+ iov++;
+ }
+ iov->iov_base = NULL;
+ iov->iov_len = 0;
+ mmsg_vector++;
+ }
+ spin_lock_init(&result->head_lock);
+ spin_lock_init(&result->tail_lock);
+ result->queue_depth = 0;
+ result->head = 0;
+ result->tail = 0;
+ return result;
+out_fail:
+ destroy_queue(result);
+ return NULL;
+}
+
+/*
+ * We do not use the RX queue as a proper wraparound queue for now
+ * This is not necessary because the consumption via netif_rx()
+ * happens in-line. While we can try using the return code of
+ * netif_rx() for flow control there are no drivers doing this today.
+ * For this RX specific use we ignore the tail/head locks and
+ * just read into a prepared queue filled with skbuffs.
+ */
+
+static struct sk_buff *prep_skb(
+ struct vector_private *vp,
+ struct user_msghdr *msg)
+{
+ int linear = vp->max_packet + vp->headroom + SAFETY_MARGIN;
+ struct sk_buff *result;
+ int iov_index = 0, len;
+ struct iovec *iov = msg->msg_iov;
+ int err, nr_frags, frag;
+ skb_frag_t *skb_frag;
+
+ if (vp->req_size <= linear)
+ len = linear;
+ else
+ len = vp->req_size;
+ result = alloc_skb_with_frags(
+ linear,
+ len - vp->max_packet,
+ 3,
+ &err,
+ GFP_ATOMIC
+ );
+ if (vp->header_size > 0)
+ iov_index++;
+ if (result == NULL) {
+ iov[iov_index].iov_base = NULL;
+ iov[iov_index].iov_len = 0;
+ goto done;
+ }
+ skb_reserve(result, vp->headroom);
+ result->dev = vp->dev;
+ skb_put(result, vp->max_packet);
+ result->data_len = len - vp->max_packet;
+ result->len += len - vp->max_packet;
+ skb_reset_mac_header(result);
+ result->ip_summed = CHECKSUM_NONE;
+ iov[iov_index].iov_base = result->data;
+ iov[iov_index].iov_len = vp->max_packet;
+ iov_index++;
+
+ nr_frags = skb_shinfo(result)->nr_frags;
+ for (frag = 0; frag < nr_frags; frag++) {
+ skb_frag = &skb_shinfo(result)->frags[frag];
+ iov[iov_index].iov_base = skb_frag_address_safe(skb_frag);
+ if (iov[iov_index].iov_base != NULL)
+ iov[iov_index].iov_len = skb_frag_size(skb_frag);
+ else
+ iov[iov_index].iov_len = 0;
+ iov_index++;
+ }
+done:
+ msg->msg_iovlen = iov_index;
+ return result;
+}
+
+
+/* Prepare queue for recvmmsg one-shot rx - fill with fresh sk_buffs*/
+
+static void prep_queue_for_rx(struct vector_queue *qi)
+{
+ struct vector_private *vp = netdev_priv(qi->dev);
+ struct mmsghdr *mmsg_vector = qi->mmsg_vector;
+ void **skbuff_vector = qi->skbuff_vector;
+ int i;
+
+ if (qi->queue_depth == 0)
+ return;
+ for (i = 0; i < qi->queue_depth; i++) {
+ /* it is OK if allocation fails - recvmmsg with NULL data in
+ * iov argument still performs an RX, just drops the packet
+ * This allows us stop faffing around with a "drop buffer"
+ */
+
+ *skbuff_vector = prep_skb(vp, &mmsg_vector->msg_hdr);
+ skbuff_vector++;
+ mmsg_vector++;
+ }
+ qi->queue_depth = 0;
+}
+
+static struct vector_device *find_device(int n)
+{
+ struct vector_device *device;
+ struct list_head *ele;
+
+ spin_lock(&vector_devices_lock);
+ list_for_each(ele, &vector_devices) {
+ device = list_entry(ele, struct vector_device, list);
+ if (device->unit == n)
+ goto out;
+ }
+ device = NULL;
+ out:
+ spin_unlock(&vector_devices_lock);
+ return device;
+}
+
+static int vector_parse(char *str, int *index_out, char **str_out,
+ char **error_out)
+{
+ int n, len, err;
+ char *start = str;
+
+ len = strlen(str);
+
+ while ((*str != ':') && (strlen(str) > 1))
+ str++;
+ if (*str != ':') {
+ *error_out = "Expected ':' after device number";
+ return -EINVAL;
+ }
+ *str = '\0';
+
+ err = kstrtouint(start, 0, &n);
+ if (err < 0) {
+ *error_out = "Bad device number";
+ return err;
+ }
+
+ str++;
+ if (find_device(n)) {
+ *error_out = "Device already configured";
+ return -EINVAL;
+ }
+
+ *index_out = n;
+ *str_out = str;
+ return 0;
+}
+
+static int vector_config(char *str, char **error_out)
+{
+ int err, n;
+ char *params;
+ struct arglist *parsed;
+
+ err = vector_parse(str, &n, ¶ms, error_out);
+ if (err != 0)
+ return err;
+
+ /* This string is broken up and the pieces used by the underlying
+ * driver. We should copy it to make sure things do not go wrong
+ * later.
+ */
+
+ params = kstrdup(params, GFP_KERNEL);
+ if (params == NULL) {
+ *error_out = "vector_config failed to strdup string";
+ return -ENOMEM;
+ }
+
+ parsed = uml_parse_vector_ifspec(params);
+
+ if (parsed == NULL) {
+ *error_out = "vector_config failed to parse parameters";
+ return -EINVAL;
+ }
+
+ vector_eth_configure(n, parsed);
+ return 0;
+}
+
+static int vector_id(char **str, int *start_out, int *end_out)
+{
+ char *end;
+ int n;
+
+ n = simple_strtoul(*str, &end, 0);
+ if ((*end != '\0') || (end == *str))
+ return -1;
+
+ *start_out = n;
+ *end_out = n;
+ *str = end;
+ return n;
+}
+
+static int vector_remove(int n, char **error_out)
+{
+ struct vector_device *vec_d;
+ struct net_device *dev;
+ struct vector_private *vp;
+
+ vec_d = find_device(n);
+ if (vec_d == NULL)
+ return -ENODEV;
+ dev = vec_d->dev;
+ vp = netdev_priv(dev);
+ if (vp->fds != NULL)
+ return -EBUSY;
+ unregister_netdev(dev);
+ platform_device_unregister(&vec_d->pdev);
+ return 0;
+}
+
+/*
+ * There is no shared per-transport initialization code, so
+ * we will just initialize each interface one by one and
+ * add them to a list
+ */
+
+static struct platform_driver uml_net_driver = {
+ .driver = {
+ .name = DRIVER_NAME,
+ },
+};
+
+
+static void vector_device_release(struct device *dev)
+{
+ struct vector_device *device = dev_get_drvdata(dev);
+ struct net_device *netdev = device->dev;
+
+ list_del(&device->list);
+ kfree(device);
+ free_netdev(netdev);
+}
+
+/* Bog standard recv using recvmsg - not used normally unless the user
+ * explicitly specifies not to use recvmmsg vector RX.
+ */
+
+static int vector_legacy_rx(struct vector_private *vp)
+{
+ int pkt_len;
+ struct user_msghdr hdr;
+ struct iovec iov[2 + MAX_IOV_SIZE]; /* header + data use case only */
+ int iovpos = 0;
+ struct sk_buff *skb;
+ int header_check;
+
+ hdr.msg_name = NULL;
+ hdr.msg_namelen = 0;
+ hdr.msg_iov = (struct iovec *) &iov;
+ hdr.msg_control = NULL;
+ hdr.msg_controllen = 0;
+ hdr.msg_flags = 0;
+
+ if (vp->header_size > 0) {
+ iov[0].iov_base = vp->header_rxbuffer;
+ iov[0].iov_len = vp->header_size;
+ }
+
+ skb = prep_skb(vp, &hdr);
+
+ if (skb == NULL) {
+ /* Read a packet into drop_buffer and don't do
+ * anything with it.
+ */
+ iov[iovpos].iov_base = drop_buffer;
+ iov[iovpos].iov_len = DROP_BUFFER_SIZE;
+ hdr.msg_iovlen = 1;
+ vp->dev->stats.rx_dropped++;
+ }
+
+ pkt_len = uml_vector_recvmsg(vp->fds->rx_fd, &hdr, 0);
+
+ if (skb != NULL) {
+ if (pkt_len > vp->header_size) {
+ if (vp->header_size > 0) {
+ header_check = vp->verify_header(
+ vp->header_rxbuffer, skb, vp);
+ if (header_check < 0) {
+ dev_kfree_skb_irq(skb);
+ vp->dev->stats.rx_dropped++;
+ vp->estats.rx_encaps_errors++;
+ return 0;
+ }
+ if (header_check > 0) {
+ vp->estats.rx_csum_offload_good++;
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
+ }
+ pskb_trim(skb, pkt_len - vp->rx_header_size);
+ skb->protocol = eth_type_trans(skb, skb->dev);
+ vp->dev->stats.rx_bytes += skb->len;
+ vp->dev->stats.rx_packets++;
+ netif_rx(skb);
+ } else {
+ dev_kfree_skb_irq(skb);
+ }
+ }
+ return pkt_len;
+}
+
+/*
+ * Packet at a time TX which falls back to vector TX if the
+ * underlying transport is busy.
+ */
+
+
+
+static int writev_tx(struct vector_private *vp, struct sk_buff *skb)
+{
+ struct iovec iov[3 + MAX_IOV_SIZE];
+ int iov_count, pkt_len = 0;
+
+ iov[0].iov_base = vp->header_txbuffer;
+ iov_count = prep_msg(vp, skb, (struct iovec *) &iov);
+
+ if (iov_count < 1)
+ goto drop;
+ pkt_len = uml_vector_writev(
+ vp->fds->tx_fd,
+ (struct iovec *) &iov,
+ iov_count
+ );
+
+ netif_trans_update(vp->dev);
+ netif_wake_queue(vp->dev);
+
+ if (pkt_len > 0) {
+ vp->dev->stats.tx_bytes += skb->len;
+ vp->dev->stats.tx_packets++;
+ } else {
+ vp->dev->stats.tx_dropped++;
+ }
+ consume_skb(skb);
+ return pkt_len;
+drop:
+ vp->dev->stats.tx_dropped++;
+ consume_skb(skb);
+ return pkt_len;
+}
+
+/*
+ * Receive as many messages as we can in one call using the special
+ * mmsg vector matched to an skb vector which we prepared earlier.
+ */
+
+static int vector_mmsg_rx(struct vector_private *vp)
+{
+ int packet_count, i;
+ struct vector_queue *qi = vp->rx_queue;
+ struct sk_buff *skb;
+ struct mmsghdr *mmsg_vector = qi->mmsg_vector;
+ void **skbuff_vector = qi->skbuff_vector;
+ int header_check;
+
+ /* Refresh the vector and make sure it is with new skbs and the
+ * iovs are updated to point to them.
+ */
+
+ prep_queue_for_rx(qi);
+
+ /* Fire the Lazy Gun - get as many packets as we can in one go. */
+
+ packet_count = uml_vector_recvmmsg(
+ vp->fds->rx_fd, qi->mmsg_vector, qi->max_depth, 0);
+
+ if (packet_count <= 0)
+ return packet_count;
+
+ /* We treat packet processing as enqueue, buffer refresh as dequeue
+ * The queue_depth tells us how many buffers have been used and how
+ * many do we need to prep the next time prep_queue_for_rx() is called.
+ */
+
+ qi->queue_depth = packet_count;
+
+ for (i = 0; i < packet_count; i++) {
+ skb = (*skbuff_vector);
+ if (mmsg_vector->msg_len > vp->header_size) {
+ if (vp->header_size > 0) {
+ header_check = vp->verify_header(
+ mmsg_vector->msg_hdr.msg_iov->iov_base,
+ skb,
+ vp
+ );
+ if (header_check < 0) {
+ /* Overlay header failed to verify - discard.
+ * We can actually keep this skb and reuse it,
+ * but that will make the prep logic too
+ * complex.
+ */
+ dev_kfree_skb_irq(skb);
+ vp->estats.rx_encaps_errors++;
+ continue;
+ }
+ if (header_check > 0) {
+ vp->estats.rx_csum_offload_good++;
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
+ }
+ pskb_trim(skb,
+ mmsg_vector->msg_len - vp->rx_header_size);
+ skb->protocol = eth_type_trans(skb, skb->dev);
+ /*
+ * We do not need to lock on updating stats here
+ * The interrupt loop is non-reentrant.
+ */
+ vp->dev->stats.rx_bytes += skb->len;
+ vp->dev->stats.rx_packets++;
+ netif_rx(skb);
+ } else {
+ /* Overlay header too short to do anything - discard.
+ * We can actually keep this skb and reuse it,
+ * but that will make the prep logic too complex.
+ */
+ if (skb != NULL)
+ dev_kfree_skb_irq(skb);
+ }
+ (*skbuff_vector) = NULL;
+ /* Move to the next buffer element */
+ mmsg_vector++;
+ skbuff_vector++;
+ }
+ if (packet_count > 0) {
+ if (vp->estats.rx_queue_max < packet_count)
+ vp->estats.rx_queue_max = packet_count;
+ vp->estats.rx_queue_running_average =
+ (vp->estats.rx_queue_running_average + packet_count) >> 1;
+ }
+ return packet_count;
+}
+
+static void vector_rx(struct vector_private *vp)
+{
+ int err;
+
+ if ((vp->options & VECTOR_RX) > 0)
+ while ((err = vector_mmsg_rx(vp)) > 0)
+ ;
+ else
+ while ((err = vector_legacy_rx(vp)) > 0)
+ ;
+ if ((err != 0) && net_ratelimit())
+ netdev_err(vp->dev, "vector_rx: error(%d)\n", err);
+}
+
+static int vector_net_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct vector_private *vp = netdev_priv(dev);
+ int queue_depth = 0;
+
+ if ((vp->options & VECTOR_TX) == 0) {
+ writev_tx(vp, skb);
+ return NETDEV_TX_OK;
+ }
+
+ /* We do BQL only in the vector path, no point doing it in
+ * packet at a time mode as there is no device queue
+ */
+
+ netdev_sent_queue(vp->dev, skb->len);
+ queue_depth = vector_enqueue(vp->tx_queue, skb);
+
+ /* if the device queue is full, stop the upper layers and
+ * flush it.
+ */
+
+ if (queue_depth >= vp->tx_queue->max_depth - 1) {
+ vp->estats.tx_kicks++;
+ netif_stop_queue(dev);
+ vector_send(vp->tx_queue);
+ return NETDEV_TX_OK;
+ }
+ if (skb->xmit_more) {
+ mod_timer(&vp->tl, vp->coalesce);
+ return NETDEV_TX_OK;
+ }
+ if (skb->len < TX_SMALL_PACKET) {
+ vp->estats.tx_kicks++;
+ vector_send(vp->tx_queue);
+ } else
+ tasklet_schedule(&vp->tx_poll);
+ return NETDEV_TX_OK;
+}
+
+static irqreturn_t vector_rx_interrupt(int irq, void *dev_id)
+{
+ struct net_device *dev = dev_id;
+ struct vector_private *vp = netdev_priv(dev);
+
+ if (!netif_running(dev))
+ return IRQ_NONE;
+ vector_rx(vp);
+ return IRQ_HANDLED;
+
+}
+
+static irqreturn_t vector_tx_interrupt(int irq, void *dev_id)
+{
+ struct net_device *dev = dev_id;
+ struct vector_private *vp = netdev_priv(dev);
+
+ if (!netif_running(dev))
+ return IRQ_NONE;
+ /* We need to pay attention to it only if we got
+ * -EAGAIN or -ENOBUFFS from sendmmsg. Otherwise
+ * we ignore it. In the future, it may be worth
+ * it to improve the IRQ controller a bit to make
+ * tweaking the IRQ mask less costly
+ */
+
+ if (vp->in_write_poll)
+ tasklet_schedule(&vp->tx_poll);
+ return IRQ_HANDLED;
+
+}
+
+static int irq_rr;
+
+static int vector_net_close(struct net_device *dev)
+{
+ struct vector_private *vp = netdev_priv(dev);
+ unsigned long flags;
+
+ netif_stop_queue(dev);
+ del_timer(&vp->tl);
+
+ if (vp->fds == NULL)
+ return 0;
+
+ /* Disable and free all IRQS */
+ if (vp->rx_irq > 0) {
+ um_free_irq(vp->rx_irq, dev);
+ vp->rx_irq = 0;
+ }
+ if (vp->tx_irq > 0) {
+ um_free_irq(vp->tx_irq, dev);
+ vp->tx_irq = 0;
+ }
+ tasklet_kill(&vp->tx_poll);
+ if (vp->fds->rx_fd > 0) {
+ os_close_file(vp->fds->rx_fd);
+ vp->fds->rx_fd = -1;
+ }
+ if (vp->fds->tx_fd > 0) {
+ os_close_file(vp->fds->tx_fd);
+ vp->fds->tx_fd = -1;
+ }
+ if (vp->bpf != NULL)
+ kfree(vp->bpf);
+ if (vp->fds->remote_addr != NULL)
+ kfree(vp->fds->remote_addr);
+ if (vp->transport_data != NULL)
+ kfree(vp->transport_data);
+ if (vp->header_rxbuffer != NULL)
+ kfree(vp->header_rxbuffer);
+ if (vp->header_txbuffer != NULL)
+ kfree(vp->header_txbuffer);
+ if (vp->rx_queue != NULL)
+ destroy_queue(vp->rx_queue);
+ if (vp->tx_queue != NULL)
+ destroy_queue(vp->tx_queue);
+ kfree(vp->fds);
+ vp->fds = NULL;
+ spin_lock_irqsave(&vp->lock, flags);
+ vp->opened = false;
+ spin_unlock_irqrestore(&vp->lock, flags);
+ return 0;
+}
+
+/* TX tasklet */
+
+static void vector_tx_poll(unsigned long data)
+{
+ struct vector_private *vp = (struct vector_private *)data;
+
+ vp->estats.tx_kicks++;
+ vector_send(vp->tx_queue);
+}
+static void vector_reset_tx(struct work_struct *work)
+{
+ struct vector_private *vp =
+ container_of(work, struct vector_private, reset_tx);
+ netdev_reset_queue(vp->dev);
+ netif_start_queue(vp->dev);
+ netif_wake_queue(vp->dev);
+}
+static int vector_net_open(struct net_device *dev)
+{
+ struct vector_private *vp = netdev_priv(dev);
+ unsigned long flags;
+ int err = -EINVAL;
+ struct vector_device *vdevice;
+
+ spin_lock_irqsave(&vp->lock, flags);
+ if (vp->opened) {
+ spin_unlock_irqrestore(&vp->lock, flags);
+ return -ENXIO;
+ }
+ vp->opened = true;
+ spin_unlock_irqrestore(&vp->lock, flags);
+
+ vp->fds = uml_vector_user_open(vp->unit, vp->parsed);
+
+ if (vp->fds == NULL)
+ goto out_close;
+
+ if (build_transport_data(vp) < 0)
+ goto out_close;
+
+ if ((vp->options & VECTOR_RX) > 0) {
+ vp->rx_queue = create_queue(
+ vp,
+ get_depth(vp->parsed),
+ vp->rx_header_size,
+ MAX_IOV_SIZE
+ );
+ vp->rx_queue->queue_depth = get_depth(vp->parsed);
+ } else {
+ vp->header_rxbuffer = kmalloc(
+ vp->rx_header_size,
+ GFP_KERNEL
+ );
+ if (vp->header_rxbuffer == NULL)
+ goto out_close;
+ }
+ if ((vp->options & VECTOR_TX) > 0) {
+ vp->tx_queue = create_queue(
+ vp,
+ get_depth(vp->parsed),
+ vp->header_size,
+ MAX_IOV_SIZE
+ );
+ } else {
+ vp->header_txbuffer = kmalloc(vp->header_size, GFP_KERNEL);
+ if (vp->header_txbuffer == NULL)
+ goto out_close;
+ }
+
+ /* READ IRQ */
+ err = um_request_irq(
+ irq_rr + VECTOR_BASE_IRQ, vp->fds->rx_fd,
+ IRQ_READ, vector_rx_interrupt,
+ IRQF_SHARED, dev->name, dev);
+ if (err != 0) {
+ netdev_err(dev, "vector_open: failed to get rx irq(%d)\n", err);
+ err = -ENETUNREACH;
+ goto out_close;
+ }
+ vp->rx_irq = irq_rr + VECTOR_BASE_IRQ;
+ dev->irq = irq_rr + VECTOR_BASE_IRQ;
+ irq_rr = (irq_rr + 1) % VECTOR_IRQ_SPACE;
+
+ /* WRITE IRQ - we need it only if we have vector TX */
+ if ((vp->options & VECTOR_TX) > 0) {
+ err = um_request_irq(
+ irq_rr + VECTOR_BASE_IRQ, vp->fds->tx_fd,
+ IRQ_WRITE, vector_tx_interrupt,
+ IRQF_SHARED, dev->name, dev);
+ if (err != 0) {
+ netdev_err(dev,
+ "vector_open: failed to get tx irq(%d)\n", err);
+ err = -ENETUNREACH;
+ goto out_close;
+ }
+ vp->tx_irq = irq_rr + VECTOR_BASE_IRQ;
+ irq_rr = (irq_rr + 1) % VECTOR_IRQ_SPACE;
+ }
+
+ if ((vp->options & VECTOR_QDISC_BYPASS) != 0) {
+ if (!uml_raw_enable_qdisc_bypass(vp->fds->rx_fd))
+ vp->options = vp->options | VECTOR_BPF;
+ }
+
+ if ((vp->options & VECTOR_BPF) != 0)
+ vp->bpf = uml_vector_default_bpf(vp->fds->rx_fd, dev->dev_addr);
+
+ netif_start_queue(dev);
+
+ /* clear buffer - it can happen that the host side of the interface
+ * is full when we get here. In this case, new data is never queued,
+ * SIGIOs never arrive, and the net never works.
+ */
+
+ vector_rx(vp);
+
+ vector_reset_stats(vp);
+ vdevice = find_device(vp->unit);
+ vdevice->opened = 1;
+
+ if ((vp->options & VECTOR_TX) != 0)
+ add_timer(&vp->tl);
+ return 0;
+out_close:
+ vector_net_close(dev);
+ return err;
+}
+
+
+static void vector_net_set_multicast_list(struct net_device *dev)
+{
+ /* TODO: - we can do some BPF games here */
+ return;
+}
+
+static void vector_net_tx_timeout(struct net_device *dev)
+{
+ struct vector_private *vp = netdev_priv(dev);
+
+ vp->estats.tx_timeout_count++;
+ netif_trans_update(dev);
+ schedule_work(&vp->reset_tx);
+}
+
+static netdev_features_t vector_fix_features(struct net_device *dev,
+ netdev_features_t features)
+{
+ features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
+ return features;
+}
+
+static int vector_set_features(struct net_device *dev,
+ netdev_features_t features)
+{
+ struct vector_private *vp = netdev_priv(dev);
+ /* Adjust buffer sizes for GSO/GRO. Unfortunately, there is
+ * no way to negotiate it on raw sockets, so we can change
+ * only our side.
+ */
+ if (features & NETIF_F_GRO)
+ /* All new frame buffers will be GRO-sized */
+ vp->req_size = 65536;
+ else
+ /* All new frame buffers will be normal sized */
+ vp->req_size = vp->max_packet + vp->headroom + SAFETY_MARGIN;
+ return 0;
+}
+
+#ifdef CONFIG_NET_POLL_CONTROLLER
+static void vector_net_poll_controller(struct net_device *dev)
+{
+ disable_irq(dev->irq);
+ vector_rx_interrupt(dev->irq, dev);
+ enable_irq(dev->irq);
+}
+#endif
+
+static void vector_net_get_drvinfo(struct net_device *dev,
+ struct ethtool_drvinfo *info)
+{
+ strlcpy(info->driver, DRIVER_NAME, sizeof(info->driver));
+ strlcpy(info->version, DRIVER_VERSION, sizeof(info->version));
+}
+
+static void vector_get_ringparam(struct net_device *netdev,
+ struct ethtool_ringparam *ring)
+{
+ struct vector_private *vp = netdev_priv(netdev);
+
+ ring->rx_max_pending = vp->rx_queue->max_depth;
+ ring->tx_max_pending = vp->tx_queue->max_depth;
+ ring->rx_pending = vp->rx_queue->max_depth;
+ ring->tx_pending = vp->tx_queue->max_depth;
+}
+
+static void vector_get_strings(struct net_device *dev, u32 stringset, u8 *buf)
+{
+ switch (stringset) {
+ case ETH_SS_TEST:
+ *buf = '\0';
+ break;
+ case ETH_SS_STATS:
+ memcpy(buf, ðtool_stats_keys, sizeof(ethtool_stats_keys));
+ break;
+ default:
+ WARN_ON(1);
+ break;
+ }
+}
+
+static int vector_get_sset_count(struct net_device *dev, int sset)
+{
+ switch (sset) {
+ case ETH_SS_TEST:
+ return 0;
+ case ETH_SS_STATS:
+ return VECTOR_NUM_STATS;
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
+static void vector_get_ethtool_stats(struct net_device *dev,
+ struct ethtool_stats *estats,
+ u64 *tmp_stats)
+{
+ struct vector_private *vp = netdev_priv(dev);
+
+ memcpy(tmp_stats, &vp->estats, sizeof(struct vector_estats));
+}
+
+static int vector_get_coalesce(struct net_device *netdev,
+ struct ethtool_coalesce *ec)
+{
+ struct vector_private *vp = netdev_priv(netdev);
+
+ ec->tx_coalesce_usecs = (vp->coalesce * 1000000) / HZ;
+ return 0;
+}
+
+static int vector_set_coalesce(struct net_device *netdev,
+ struct ethtool_coalesce *ec)
+{
+ struct vector_private *vp = netdev_priv(netdev);
+
+ vp->coalesce = (ec->tx_coalesce_usecs * HZ) / 1000000;
+ if (vp->coalesce == 0)
+ vp->coalesce = 1;
+ return 0;
+}
+
+static const struct ethtool_ops vector_net_ethtool_ops = {
+ .get_drvinfo = vector_net_get_drvinfo,
+ .get_link = ethtool_op_get_link,
+ .get_ts_info = ethtool_op_get_ts_info,
+ .get_ringparam = vector_get_ringparam,
+ .get_strings = vector_get_strings,
+ .get_sset_count = vector_get_sset_count,
+ .get_ethtool_stats = vector_get_ethtool_stats,
+ .get_coalesce = vector_get_coalesce,
+ .set_coalesce = vector_set_coalesce,
+};
+
+
+static const struct net_device_ops vector_netdev_ops = {
+ .ndo_open = vector_net_open,
+ .ndo_stop = vector_net_close,
+ .ndo_start_xmit = vector_net_start_xmit,
+ .ndo_set_rx_mode = vector_net_set_multicast_list,
+ .ndo_tx_timeout = vector_net_tx_timeout,
+ .ndo_set_mac_address = eth_mac_addr,
+ .ndo_validate_addr = eth_validate_addr,
+ .ndo_fix_features = vector_fix_features,
+ .ndo_set_features = vector_set_features,
+#ifdef CONFIG_NET_POLL_CONTROLLER
+ .ndo_poll_controller = vector_net_poll_controller,
+#endif
+};
+
+
+static void vector_timer_expire(struct timer_list *t)
+{
+ struct vector_private *vp = from_timer(vp, t, tl);
+
+ vp->estats.tx_kicks++;
+ vector_send(vp->tx_queue);
+}
+
+static void vector_eth_configure(
+ int n,
+ struct arglist *def
+ )
+{
+ struct vector_device *device;
+ struct net_device *dev;
+ struct vector_private *vp;
+ int err;
+
+ device = kzalloc(sizeof(*device), GFP_KERNEL);
+ if (device == NULL) {
+ printk(KERN_ERR "eth_configure failed to allocate struct "
+ "vector_device\n");
+ return;
+ }
+ dev = alloc_etherdev(sizeof(struct vector_private));
+ if (dev == NULL) {
+ printk(KERN_ERR "eth_configure: failed to allocate struct "
+ "net_device for vec%d\n", n);
+ goto out_free_device;
+ }
+
+ dev->mtu = get_mtu(def);
+
+ INIT_LIST_HEAD(&device->list);
+ device->unit = n;
+
+ /* If this name ends up conflicting with an existing registered
+ * netdevice, that is OK, register_netdev{,ice}() will notice this
+ * and fail.
+ */
+ snprintf(dev->name, sizeof(dev->name), "vec%d", n);
+ uml_net_setup_etheraddr(dev, uml_vector_fetch_arg(def, "mac"));
+ vp = netdev_priv(dev);
+
+ /* sysfs register */
+ if (!driver_registered) {
+ platform_driver_register(¨_net_driver);
+ driver_registered = 1;
+ }
+ device->pdev.id = n;
+ device->pdev.name = DRIVER_NAME;
+ device->pdev.dev.release = vector_device_release;
+ dev_set_drvdata(&device->pdev.dev, device);
+ if (platform_device_register(&device->pdev))
+ goto out_free_netdev;
+ SET_NETDEV_DEV(dev, &device->pdev.dev);
+
+ device->dev = dev;
+
+ *vp = ((struct vector_private)
+ {
+ .list = LIST_HEAD_INIT(vp->list),
+ .dev = dev,
+ .unit = n,
+ .options = get_transport_options(def),
+ .rx_irq = 0,
+ .tx_irq = 0,
+ .parsed = def,
+ .max_packet = get_mtu(def) + ETH_HEADER_OTHER,
+ /* TODO - we need to calculate headroom so that ip header
+ * is 16 byte aligned all the time
+ */
+ .headroom = get_headroom(def),
+ .form_header = NULL,
+ .verify_header = NULL,
+ .header_rxbuffer = NULL,
+ .header_txbuffer = NULL,
+ .header_size = 0,
+ .rx_header_size = 0,
+ .rexmit_scheduled = false,
+ .opened = false,
+ .transport_data = NULL,
+ .in_write_poll = false,
+ .coalesce = 2,
+ .req_size = get_req_size(def)
+ });
+
+ dev->features = dev->hw_features = (NETIF_F_SG | NETIF_F_FRAGLIST);
+ tasklet_init(&vp->tx_poll, vector_tx_poll, (unsigned long)vp);
+ INIT_WORK(&vp->reset_tx, vector_reset_tx);
+
+ timer_setup(&vp->tl, vector_timer_expire, 0);
+ spin_lock_init(&vp->lock);
+
+ /* FIXME */
+ dev->netdev_ops = &vector_netdev_ops;
+ dev->ethtool_ops = &vector_net_ethtool_ops;
+ dev->watchdog_timeo = (HZ >> 1);
+ /* primary IRQ - fixme */
+ dev->irq = 0; /* we will adjust this once opened */
+
+ rtnl_lock();
+ err = register_netdevice(dev);
+ rtnl_unlock();
+ if (err)
+ goto out_undo_user_init;
+
+ spin_lock(&vector_devices_lock);
+ list_add(&device->list, &vector_devices);
+ spin_unlock(&vector_devices_lock);
+
+ return;
+
+out_undo_user_init:
+ return;
+out_free_netdev:
+ free_netdev(dev);
+out_free_device:
+ kfree(device);
+}
+
+
+
+
+/*
+ * Invoked late in the init
+ */
+
+static int __init vector_init(void)
+{
+ struct list_head *ele;
+ struct vector_cmd_line_arg *def;
+ struct arglist *parsed;
+
+ list_for_each(ele, &vec_cmd_line) {
+ def = list_entry(ele, struct vector_cmd_line_arg, list);
+ parsed = uml_parse_vector_ifspec(def->arguments);
+ if (parsed != NULL)
+ vector_eth_configure(def->unit, parsed);
+ }
+ return 0;
+}
+
+
+/* Invoked at initial argument parsing, only stores
+ * arguments until a proper vector_init is called
+ * later
+ */
+
+static int __init vector_setup(char *str)
+{
+ char *error;
+ int n, err;
+ struct vector_cmd_line_arg *new;
+
+ err = vector_parse(str, &n, &str, &error);
+ if (err) {
+ printk(KERN_ERR "vector_setup - Couldn't parse '%s' : %s\n",
+ str, error);
+ return 1;
+ }
+ new = alloc_bootmem(sizeof(*new));
+ INIT_LIST_HEAD(&new->list);
+ new->unit = n;
+ new->arguments = str;
+ list_add_tail(&new->list, &vec_cmd_line);
+ return 1;
+}
+
+__setup("vec", vector_setup);
+__uml_help(vector_setup,
+"vec[0-9]+:<option>=<value>,<option>=<value>\n"
+" Configure a vector io network device.\n\n"
+);
+
+late_initcall(vector_init);
+
+static struct mc_device vector_mc = {
+ .list = LIST_HEAD_INIT(vector_mc.list),
+ .name = "vec",
+ .config = vector_config,
+ .get_config = NULL,
+ .id = vector_id,
+ .remove = vector_remove,
+};
+
+#ifdef CONFIG_INET
+static int vector_inetaddr_event(
+ struct notifier_block *this,
+ unsigned long event,
+ void *ptr)
+{
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block vector_inetaddr_notifier = {
+ .notifier_call = vector_inetaddr_event,
+};
+
+static void inet_register(void)
+{
+ register_inetaddr_notifier(&vector_inetaddr_notifier);
+}
+#else
+static inline void inet_register(void)
+{
+}
+#endif
+
+static int vector_net_init(void)
+{
+ mconsole_register_dev(&vector_mc);
+ inet_register();
+ return 0;
+}
+
+__initcall(vector_net_init);
+
+
+
--- /dev/null
+/*
+ * Copyright (C) 2002 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+ * Licensed under the GPL
+ */
+
+#ifndef __UM_VECTOR_KERN_H
+#define __UM_VECTOR_KERN_H
+
+#include <linux/netdevice.h>
+#include <linux/platform_device.h>
+#include <linux/skbuff.h>
+#include <linux/socket.h>
+#include <linux/list.h>
+#include <linux/ctype.h>
+#include <linux/workqueue.h>
+#include <linux/interrupt.h>
+#include "vector_user.h"
+
+/* Queue structure specially adapted for multiple enqueue/dequeue
+ * in a mmsgrecv/mmsgsend context
+ */
+
+/* Dequeue method */
+
+#define QUEUE_SENDMSG 0
+#define QUEUE_SENDMMSG 1
+
+#define VECTOR_RX 1
+#define VECTOR_TX (1 << 1)
+#define VECTOR_BPF (1 << 2)
+#define VECTOR_QDISC_BYPASS (1 << 3)
+
+#define ETH_MAX_PACKET 1500
+#define ETH_HEADER_OTHER 32 /* just in case someone decides to go mad on QnQ */
+
+struct vector_queue {
+ struct mmsghdr *mmsg_vector;
+ void **skbuff_vector;
+ /* backlink to device which owns us */
+ struct net_device *dev;
+ spinlock_t head_lock;
+ spinlock_t tail_lock;
+ int queue_depth, head, tail, max_depth, max_iov_frags;
+ short options;
+};
+
+struct vector_estats {
+ uint64_t rx_queue_max;
+ uint64_t rx_queue_running_average;
+ uint64_t tx_queue_max;
+ uint64_t tx_queue_running_average;
+ uint64_t rx_encaps_errors;
+ uint64_t tx_timeout_count;
+ uint64_t tx_restart_queue;
+ uint64_t tx_kicks;
+ uint64_t tx_flow_control_xon;
+ uint64_t tx_flow_control_xoff;
+ uint64_t rx_csum_offload_good;
+ uint64_t rx_csum_offload_errors;
+ uint64_t sg_ok;
+ uint64_t sg_linearized;
+};
+
+#define VERIFY_HEADER_NOK -1
+#define VERIFY_HEADER_OK 0
+#define VERIFY_CSUM_OK 1
+
+struct vector_private {
+ struct list_head list;
+ spinlock_t lock;
+ struct net_device *dev;
+
+ int unit;
+
+ /* Timeout timer in TX */
+
+ struct timer_list tl;
+
+ /* Scheduled "remove device" work */
+ struct work_struct reset_tx;
+ struct vector_fds *fds;
+
+ struct vector_queue *rx_queue;
+ struct vector_queue *tx_queue;
+
+ int rx_irq;
+ int tx_irq;
+
+ struct arglist *parsed;
+
+ void *transport_data; /* transport specific params if needed */
+
+ int max_packet;
+ int req_size; /* different from max packet - used for TSO */
+ int headroom;
+
+ int options;
+
+ /* remote address if any - some transports will leave this as null */
+
+ int header_size;
+ int rx_header_size;
+ int coalesce;
+
+ void *header_rxbuffer;
+ void *header_txbuffer;
+
+ int (*form_header)(uint8_t *header,
+ struct sk_buff *skb, struct vector_private *vp);
+ int (*verify_header)(uint8_t *header,
+ struct sk_buff *skb, struct vector_private *vp);
+
+ spinlock_t stats_lock;
+
+ struct tasklet_struct tx_poll;
+ bool rexmit_scheduled;
+ bool opened;
+ bool in_write_poll;
+
+ /* ethtool stats */
+
+ struct vector_estats estats;
+ void *bpf;
+
+ char user[0];
+};
+
+extern int build_transport_data(struct vector_private *vp);
+
+#endif
--- /dev/null
+/*
+ * Copyright (C) 2017 - Cambridge Greys Limited
+ * Copyright (C) 2011 - 2014 Cisco Systems Inc
+ * Licensed under the GPL.
+ */
+
+#include <linux/etherdevice.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <asm/byteorder.h>
+#include <uapi/linux/ip.h>
+#include <uapi/linux/virtio_net.h>
+#include <linux/virtio_net.h>
+#include <linux/virtio_byteorder.h>
+#include <linux/netdev_features.h>
+#include "vector_user.h"
+#include "vector_kern.h"
+
+#define GOOD_LINEAR 512
+#define GSO_ERROR "Incoming GSO frames and GRO disabled on the interface"
+
+struct gre_minimal_header {
+ uint16_t header;
+ uint16_t arptype;
+};
+
+
+struct uml_gre_data {
+ uint32_t rx_key;
+ uint32_t tx_key;
+ uint32_t sequence;
+
+ bool ipv6;
+ bool has_sequence;
+ bool pin_sequence;
+ bool checksum;
+ bool key;
+ struct gre_minimal_header expected_header;
+
+ uint32_t checksum_offset;
+ uint32_t key_offset;
+ uint32_t sequence_offset;
+
+};
+
+struct uml_l2tpv3_data {
+ uint64_t rx_cookie;
+ uint64_t tx_cookie;
+ uint64_t rx_session;
+ uint64_t tx_session;
+ uint32_t counter;
+
+ bool udp;
+ bool ipv6;
+ bool has_counter;
+ bool pin_counter;
+ bool cookie;
+ bool cookie_is_64;
+
+ uint32_t cookie_offset;
+ uint32_t session_offset;
+ uint32_t counter_offset;
+};
+
+static int l2tpv3_form_header(uint8_t *header,
+ struct sk_buff *skb, struct vector_private *vp)
+{
+ struct uml_l2tpv3_data *td = vp->transport_data;
+ uint32_t *counter;
+
+ if (td->udp)
+ *(uint32_t *) header = cpu_to_be32(L2TPV3_DATA_PACKET);
+ (*(uint32_t *) (header + td->session_offset)) = td->tx_session;
+
+ if (td->cookie) {
+ if (td->cookie_is_64)
+ (*(uint64_t *)(header + td->cookie_offset)) =
+ td->tx_cookie;
+ else
+ (*(uint32_t *)(header + td->cookie_offset)) =
+ td->tx_cookie;
+ }
+ if (td->has_counter) {
+ counter = (uint32_t *)(header + td->counter_offset);
+ if (td->pin_counter) {
+ *counter = 0;
+ } else {
+ td->counter++;
+ *counter = cpu_to_be32(td->counter);
+ }
+ }
+ return 0;
+}
+
+static int gre_form_header(uint8_t *header,
+ struct sk_buff *skb, struct vector_private *vp)
+{
+ struct uml_gre_data *td = vp->transport_data;
+ uint32_t *sequence;
+ *((uint32_t *) header) = *((uint32_t *) &td->expected_header);
+ if (td->key)
+ (*(uint32_t *) (header + td->key_offset)) = td->tx_key;
+ if (td->has_sequence) {
+ sequence = (uint32_t *)(header + td->sequence_offset);
+ if (td->pin_sequence)
+ *sequence = 0;
+ else
+ *sequence = cpu_to_be32(++td->sequence);
+ }
+ return 0;
+}
+
+static int raw_form_header(uint8_t *header,
+ struct sk_buff *skb, struct vector_private *vp)
+{
+ struct virtio_net_hdr *vheader = (struct virtio_net_hdr *) header;
+
+ virtio_net_hdr_from_skb(
+ skb,
+ vheader,
+ virtio_legacy_is_little_endian(),
+ false
+ );
+
+ return 0;
+}
+
+static int l2tpv3_verify_header(
+ uint8_t *header, struct sk_buff *skb, struct vector_private *vp)
+{
+ struct uml_l2tpv3_data *td = vp->transport_data;
+ uint32_t *session;
+ uint64_t cookie;
+
+ if ((!td->udp) && (!td->ipv6))
+ header += sizeof(struct iphdr) /* fix for ipv4 raw */;
+
+ /* we do not do a strict check for "data" packets as per
+ * the RFC spec because the pure IP spec does not have
+ * that anyway.
+ */
+
+ if (td->cookie) {
+ if (td->cookie_is_64)
+ cookie = *(uint64_t *)(header + td->cookie_offset);
+ else
+ cookie = *(uint32_t *)(header + td->cookie_offset);
+ if (cookie != td->rx_cookie) {
+ if (net_ratelimit())
+ netdev_err(vp->dev, "uml_l2tpv3: unknown cookie id");
+ return -1;
+ }
+ }
+ session = (uint32_t *) (header + td->session_offset);
+ if (*session != td->rx_session) {
+ if (net_ratelimit())
+ netdev_err(vp->dev, "uml_l2tpv3: session mismatch");
+ return -1;
+ }
+ return 0;
+}
+
+static int gre_verify_header(
+ uint8_t *header, struct sk_buff *skb, struct vector_private *vp)
+{
+
+ uint32_t key;
+ struct uml_gre_data *td = vp->transport_data;
+
+ if (!td->ipv6)
+ header += sizeof(struct iphdr) /* fix for ipv4 raw */;
+
+ if (*((uint32_t *) header) != *((uint32_t *) &td->expected_header)) {
+ if (net_ratelimit())
+ netdev_err(vp->dev, "header type disagreement, expecting %0x, got %0x",
+ *((uint32_t *) &td->expected_header),
+ *((uint32_t *) header)
+ );
+ return -1;
+ }
+
+ if (td->key) {
+ key = (*(uint32_t *)(header + td->key_offset));
+ if (key != td->rx_key) {
+ if (net_ratelimit())
+ netdev_err(vp->dev, "unknown key id %0x, expecting %0x",
+ key, td->rx_key);
+ return -1;
+ }
+ }
+ return 0;
+}
+
+static int raw_verify_header(
+ uint8_t *header, struct sk_buff *skb, struct vector_private *vp)
+{
+ struct virtio_net_hdr *vheader = (struct virtio_net_hdr *) header;
+
+ if ((vheader->gso_type != VIRTIO_NET_HDR_GSO_NONE) &&
+ (vp->req_size != 65536)) {
+ if (net_ratelimit())
+ netdev_err(
+ vp->dev,
+ GSO_ERROR
+ );
+ }
+ if ((vheader->flags & VIRTIO_NET_HDR_F_DATA_VALID) > 0)
+ return 1;
+
+ virtio_net_hdr_to_skb(skb, vheader, virtio_legacy_is_little_endian());
+ return 0;
+}
+
+static bool get_uint_param(
+ struct arglist *def, char *param, unsigned int *result)
+{
+ char *arg = uml_vector_fetch_arg(def, param);
+
+ if (arg != NULL) {
+ if (kstrtoint(arg, 0, result) == 0)
+ return true;
+ }
+ return false;
+}
+
+static bool get_ulong_param(
+ struct arglist *def, char *param, unsigned long *result)
+{
+ char *arg = uml_vector_fetch_arg(def, param);
+
+ if (arg != NULL) {
+ if (kstrtoul(arg, 0, result) == 0)
+ return true;
+ return true;
+ }
+ return false;
+}
+
+static int build_gre_transport_data(struct vector_private *vp)
+{
+ struct uml_gre_data *td;
+ int temp_int;
+ int temp_rx;
+ int temp_tx;
+
+ vp->transport_data = kmalloc(sizeof(struct uml_gre_data), GFP_KERNEL);
+ if (vp->transport_data == NULL)
+ return -ENOMEM;
+ td = vp->transport_data;
+ td->sequence = 0;
+
+ td->expected_header.arptype = GRE_IRB;
+ td->expected_header.header = 0;
+
+ vp->form_header = &gre_form_header;
+ vp->verify_header = &gre_verify_header;
+ vp->header_size = 4;
+ td->key_offset = 4;
+ td->sequence_offset = 4;
+ td->checksum_offset = 4;
+
+ td->ipv6 = false;
+ if (get_uint_param(vp->parsed, "v6", &temp_int)) {
+ if (temp_int > 0)
+ td->ipv6 = true;
+ }
+ td->key = false;
+ if (get_uint_param(vp->parsed, "rx_key", &temp_rx)) {
+ if (get_uint_param(vp->parsed, "tx_key", &temp_tx)) {
+ td->key = true;
+ td->expected_header.header |= GRE_MODE_KEY;
+ td->rx_key = cpu_to_be32(temp_rx);
+ td->tx_key = cpu_to_be32(temp_tx);
+ vp->header_size += 4;
+ td->sequence_offset += 4;
+ } else {
+ return -EINVAL;
+ }
+ }
+
+ td->sequence = false;
+ if (get_uint_param(vp->parsed, "sequence", &temp_int)) {
+ if (temp_int > 0) {
+ vp->header_size += 4;
+ td->has_sequence = true;
+ td->expected_header.header |= GRE_MODE_SEQUENCE;
+ if (get_uint_param(
+ vp->parsed, "pin_sequence", &temp_int)) {
+ if (temp_int > 0)
+ td->pin_sequence = true;
+ }
+ }
+ }
+ vp->rx_header_size = vp->header_size;
+ if (!td->ipv6)
+ vp->rx_header_size += sizeof(struct iphdr);
+ return 0;
+}
+
+static int build_l2tpv3_transport_data(struct vector_private *vp)
+{
+
+ struct uml_l2tpv3_data *td;
+ int temp_int, temp_rxs, temp_txs;
+ unsigned long temp_rx;
+ unsigned long temp_tx;
+
+ vp->transport_data = kmalloc(
+ sizeof(struct uml_l2tpv3_data), GFP_KERNEL);
+
+ if (vp->transport_data == NULL)
+ return -ENOMEM;
+
+ td = vp->transport_data;
+
+ vp->form_header = &l2tpv3_form_header;
+ vp->verify_header = &l2tpv3_verify_header;
+ td->counter = 0;
+
+ vp->header_size = 4;
+ td->session_offset = 0;
+ td->cookie_offset = 4;
+ td->counter_offset = 4;
+
+
+ td->ipv6 = false;
+ if (get_uint_param(vp->parsed, "v6", &temp_int)) {
+ if (temp_int > 0)
+ td->ipv6 = true;
+ }
+
+ if (get_uint_param(vp->parsed, "rx_session", &temp_rxs)) {
+ if (get_uint_param(vp->parsed, "tx_session", &temp_txs)) {
+ td->tx_session = cpu_to_be32(temp_txs);
+ td->rx_session = cpu_to_be32(temp_rxs);
+ } else {
+ return -EINVAL;
+ }
+ } else {
+ return -EINVAL;
+ }
+
+ td->cookie_is_64 = false;
+ if (get_uint_param(vp->parsed, "cookie64", &temp_int)) {
+ if (temp_int > 0)
+ td->cookie_is_64 = true;
+ }
+ td->cookie = false;
+ if (get_ulong_param(vp->parsed, "rx_cookie", &temp_rx)) {
+ if (get_ulong_param(vp->parsed, "tx_cookie", &temp_tx)) {
+ td->cookie = true;
+ if (td->cookie_is_64) {
+ td->rx_cookie = cpu_to_be64(temp_rx);
+ td->tx_cookie = cpu_to_be64(temp_tx);
+ vp->header_size += 8;
+ td->counter_offset += 8;
+ } else {
+ td->rx_cookie = cpu_to_be32(temp_rx);
+ td->tx_cookie = cpu_to_be32(temp_tx);
+ vp->header_size += 4;
+ td->counter_offset += 4;
+ }
+ } else {
+ return -EINVAL;
+ }
+ }
+
+ td->has_counter = false;
+ if (get_uint_param(vp->parsed, "counter", &temp_int)) {
+ if (temp_int > 0) {
+ td->has_counter = true;
+ vp->header_size += 4;
+ if (get_uint_param(
+ vp->parsed, "pin_counter", &temp_int)) {
+ if (temp_int > 0)
+ td->pin_counter = true;
+ }
+ }
+ }
+
+ if (get_uint_param(vp->parsed, "udp", &temp_int)) {
+ if (temp_int > 0) {
+ td->udp = true;
+ vp->header_size += 4;
+ td->counter_offset += 4;
+ td->session_offset += 4;
+ td->cookie_offset += 4;
+ }
+ }
+
+ vp->rx_header_size = vp->header_size;
+ if ((!td->ipv6) && (!td->udp))
+ vp->rx_header_size += sizeof(struct iphdr);
+
+ return 0;
+}
+
+static int build_raw_transport_data(struct vector_private *vp)
+{
+ if (uml_raw_enable_vnet_headers(vp->fds->rx_fd)) {
+ if (!uml_raw_enable_vnet_headers(vp->fds->tx_fd))
+ return -1;
+ vp->form_header = &raw_form_header;
+ vp->verify_header = &raw_verify_header;
+ vp->header_size = sizeof(struct virtio_net_hdr);
+ vp->rx_header_size = sizeof(struct virtio_net_hdr);
+ vp->dev->hw_features |= (NETIF_F_TSO | NETIF_F_GRO);
+ vp->dev->features |=
+ (NETIF_F_RXCSUM | NETIF_F_HW_CSUM |
+ NETIF_F_TSO | NETIF_F_GRO);
+ netdev_info(
+ vp->dev,
+ "raw: using vnet headers for tso and tx/rx checksum"
+ );
+ }
+ return 0;
+}
+
+static int build_tap_transport_data(struct vector_private *vp)
+{
+ if (uml_raw_enable_vnet_headers(vp->fds->rx_fd)) {
+ vp->form_header = &raw_form_header;
+ vp->verify_header = &raw_verify_header;
+ vp->header_size = sizeof(struct virtio_net_hdr);
+ vp->rx_header_size = sizeof(struct virtio_net_hdr);
+ vp->dev->hw_features |=
+ (NETIF_F_TSO | NETIF_F_GSO | NETIF_F_GRO);
+ vp->dev->features |=
+ (NETIF_F_RXCSUM | NETIF_F_HW_CSUM |
+ NETIF_F_TSO | NETIF_F_GSO | NETIF_F_GRO);
+ netdev_info(
+ vp->dev,
+ "tap/raw: using vnet headers for tso and tx/rx checksum"
+ );
+ } else {
+ return 0; /* do not try to enable tap too if raw failed */
+ }
+ if (uml_tap_enable_vnet_headers(vp->fds->tx_fd))
+ return 0;
+ return -1;
+}
+
+int build_transport_data(struct vector_private *vp)
+{
+ char *transport = uml_vector_fetch_arg(vp->parsed, "transport");
+
+ if (strncmp(transport, TRANS_GRE, TRANS_GRE_LEN) == 0)
+ return build_gre_transport_data(vp);
+ if (strncmp(transport, TRANS_L2TPV3, TRANS_L2TPV3_LEN) == 0)
+ return build_l2tpv3_transport_data(vp);
+ if (strncmp(transport, TRANS_RAW, TRANS_RAW_LEN) == 0)
+ return build_raw_transport_data(vp);
+ if (strncmp(transport, TRANS_TAP, TRANS_TAP_LEN) == 0)
+ return build_tap_transport_data(vp);
+ return 0;
+}
+
--- /dev/null
+/*
+ * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+ * Licensed under the GPL
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <stdarg.h>
+#include <errno.h>
+#include <stddef.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <net/if.h>
+#include <linux/if_tun.h>
+#include <arpa/inet.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <net/ethernet.h>
+#include <netinet/ip.h>
+#include <netinet/ether.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <sys/socket.h>
+#include <sys/wait.h>
+#include <linux/virtio_net.h>
+#include <netdb.h>
+#include <stdlib.h>
+#include <os.h>
+#include <um_malloc.h>
+#include "vector_user.h"
+
+#define ID_GRE 0
+#define ID_L2TPV3 1
+#define ID_MAX 1
+
+#define TOKEN_IFNAME "ifname"
+
+#define TRANS_RAW "raw"
+#define TRANS_RAW_LEN strlen(TRANS_RAW)
+
+#define VNET_HDR_FAIL "could not enable vnet headers on fd %d"
+#define TUN_GET_F_FAIL "tapraw: TUNGETFEATURES failed: %s"
+#define L2TPV3_BIND_FAIL "l2tpv3_open : could not bind socket err=%i"
+#define BPF_ATTACH_FAIL "Failed to attach filter size %d to %d, err %d\n"
+
+/* This is very ugly and brute force lookup, but it is done
+ * only once at initialization so not worth doing hashes or
+ * anything more intelligent
+ */
+
+char *uml_vector_fetch_arg(struct arglist *ifspec, char *token)
+{
+ int i;
+
+ for (i = 0; i < ifspec->numargs; i++) {
+ if (strcmp(ifspec->tokens[i], token) == 0)
+ return ifspec->values[i];
+ }
+ return NULL;
+
+}
+
+struct arglist *uml_parse_vector_ifspec(char *arg)
+{
+ struct arglist *result;
+ int pos, len;
+ bool parsing_token = true, next_starts = true;
+
+ if (arg == NULL)
+ return NULL;
+ result = uml_kmalloc(sizeof(struct arglist), UM_GFP_KERNEL);
+ if (result == NULL)
+ return NULL;
+ result->numargs = 0;
+ len = strlen(arg);
+ for (pos = 0; pos < len; pos++) {
+ if (next_starts) {
+ if (parsing_token) {
+ result->tokens[result->numargs] = arg + pos;
+ } else {
+ result->values[result->numargs] = arg + pos;
+ result->numargs++;
+ }
+ next_starts = false;
+ }
+ if (*(arg + pos) == '=') {
+ if (parsing_token)
+ parsing_token = false;
+ else
+ goto cleanup;
+ next_starts = true;
+ (*(arg + pos)) = '\0';
+ }
+ if (*(arg + pos) == ',') {
+ parsing_token = true;
+ next_starts = true;
+ (*(arg + pos)) = '\0';
+ }
+ }
+ return result;
+cleanup:
+ printk(UM_KERN_ERR "vector_setup - Couldn't parse '%s'\n", arg);
+ kfree(result);
+ return NULL;
+}
+
+/*
+ * Socket/FD configuration functions. These return an structure
+ * of rx and tx descriptors to cover cases where these are not
+ * the same (f.e. read via raw socket and write via tap).
+ */
+
+#define PATH_NET_TUN "/dev/net/tun"
+
+static struct vector_fds *user_init_tap_fds(struct arglist *ifspec)
+{
+ struct ifreq ifr;
+ int fd = -1;
+ struct sockaddr_ll sock;
+ int err = -ENOMEM, offload;
+ char *iface;
+ struct vector_fds *result = NULL;
+
+ iface = uml_vector_fetch_arg(ifspec, TOKEN_IFNAME);
+ if (iface == NULL) {
+ printk(UM_KERN_ERR "uml_tap: failed to parse interface spec\n");
+ goto tap_cleanup;
+ }
+
+ result = uml_kmalloc(sizeof(struct vector_fds), UM_GFP_KERNEL);
+ if (result == NULL) {
+ printk(UM_KERN_ERR "uml_tap: failed to allocate file descriptors\n");
+ goto tap_cleanup;
+ }
+ result->rx_fd = -1;
+ result->tx_fd = -1;
+ result->remote_addr = NULL;
+ result->remote_addr_size = 0;
+
+ /* TAP */
+
+ fd = open(PATH_NET_TUN, O_RDWR);
+ if (fd < 0) {
+ printk(UM_KERN_ERR "uml_tap: failed to open tun device\n");
+ goto tap_cleanup;
+ }
+ result->tx_fd = fd;
+ memset(&ifr, 0, sizeof(ifr));
+ ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
+ strncpy((char *)&ifr.ifr_name, iface, sizeof(ifr.ifr_name) - 1);
+
+ err = ioctl(fd, TUNSETIFF, (void *) &ifr);
+ if (err != 0) {
+ printk(UM_KERN_ERR "uml_tap: failed to select tap interface\n");
+ goto tap_cleanup;
+ }
+
+ offload = TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6;
+ ioctl(fd, TUNSETOFFLOAD, offload);
+
+ /* RAW */
+
+ fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
+ if (fd == -1) {
+ printk(UM_KERN_ERR
+ "uml_tap: failed to create socket: %i\n", -errno);
+ goto tap_cleanup;
+ }
+ result->rx_fd = fd;
+ memset(&ifr, 0, sizeof(ifr));
+ strncpy((char *)&ifr.ifr_name, iface, sizeof(ifr.ifr_name) - 1);
+ if (ioctl(fd, SIOCGIFINDEX, (void *) &ifr) < 0) {
+ printk(UM_KERN_ERR
+ "uml_tap: failed to set interface: %i\n", -errno);
+ goto tap_cleanup;
+ }
+
+ sock.sll_family = AF_PACKET;
+ sock.sll_protocol = htons(ETH_P_ALL);
+ sock.sll_ifindex = ifr.ifr_ifindex;
+
+ if (bind(fd,
+ (struct sockaddr *) &sock, sizeof(struct sockaddr_ll)) < 0) {
+ printk(UM_KERN_ERR
+ "user_init_tap: failed to bind raw pair, err %d\n",
+ -errno);
+ goto tap_cleanup;
+ }
+ return result;
+tap_cleanup:
+ printk(UM_KERN_ERR "user_init_tap: init failed, error %d", err);
+ if (result != NULL) {
+ if (result->rx_fd >= 0)
+ os_close_file(result->rx_fd);
+ if (result->tx_fd >= 0)
+ os_close_file(result->tx_fd);
+ kfree(result);
+ }
+ return NULL;
+}
+
+
+static struct vector_fds *user_init_raw_fds(struct arglist *ifspec)
+{
+ struct ifreq ifr;
+ int rxfd = -1, txfd = -1;
+ struct sockaddr_ll sock;
+ int err = -ENOMEM;
+ char *iface;
+ struct vector_fds *result = NULL;
+
+ iface = uml_vector_fetch_arg(ifspec, TOKEN_IFNAME);
+ if (iface == NULL)
+ goto cleanup;
+
+ rxfd = socket(AF_PACKET, SOCK_RAW, ETH_P_ALL);
+ if (rxfd == -1) {
+ err = -errno;
+ goto cleanup;
+ }
+ txfd = socket(AF_PACKET, SOCK_RAW, 0); /* Turn off RX on this fd */
+ if (txfd == -1) {
+ err = -errno;
+ goto cleanup;
+ }
+ memset(&ifr, 0, sizeof(ifr));
+ strncpy((char *)&ifr.ifr_name, iface, sizeof(ifr.ifr_name) - 1);
+ if (ioctl(rxfd, SIOCGIFINDEX, (void *) &ifr) < 0) {
+ err = -errno;
+ goto cleanup;
+ }
+
+ sock.sll_family = AF_PACKET;
+ sock.sll_protocol = htons(ETH_P_ALL);
+ sock.sll_ifindex = ifr.ifr_ifindex;
+
+ if (bind(rxfd,
+ (struct sockaddr *) &sock, sizeof(struct sockaddr_ll)) < 0) {
+ err = -errno;
+ goto cleanup;
+ }
+
+ sock.sll_family = AF_PACKET;
+ sock.sll_protocol = htons(ETH_P_IP);
+ sock.sll_ifindex = ifr.ifr_ifindex;
+
+ if (bind(txfd,
+ (struct sockaddr *) &sock, sizeof(struct sockaddr_ll)) < 0) {
+ err = -errno;
+ goto cleanup;
+ }
+
+ result = uml_kmalloc(sizeof(struct vector_fds), UM_GFP_KERNEL);
+ if (result != NULL) {
+ result->rx_fd = rxfd;
+ result->tx_fd = txfd;
+ result->remote_addr = NULL;
+ result->remote_addr_size = 0;
+ }
+ return result;
+cleanup:
+ printk(UM_KERN_ERR "user_init_raw: init failed, error %d", err);
+ if (rxfd >= 0)
+ os_close_file(rxfd);
+ if (txfd >= 0)
+ os_close_file(txfd);
+ if (result != NULL)
+ kfree(result);
+ return NULL;
+}
+
+
+bool uml_raw_enable_qdisc_bypass(int fd)
+{
+ int optval = 1;
+
+ if (setsockopt(fd,
+ SOL_PACKET, PACKET_QDISC_BYPASS,
+ &optval, sizeof(optval)) != 0) {
+ return false;
+ }
+ return true;
+}
+
+bool uml_raw_enable_vnet_headers(int fd)
+{
+ int optval = 1;
+
+ if (setsockopt(fd,
+ SOL_PACKET, PACKET_VNET_HDR,
+ &optval, sizeof(optval)) != 0) {
+ printk(UM_KERN_INFO VNET_HDR_FAIL, fd);
+ return false;
+ }
+ return true;
+}
+bool uml_tap_enable_vnet_headers(int fd)
+{
+ unsigned int features;
+ int len = sizeof(struct virtio_net_hdr);
+
+ if (ioctl(fd, TUNGETFEATURES, &features) == -1) {
+ printk(UM_KERN_INFO TUN_GET_F_FAIL, strerror(errno));
+ return false;
+ }
+ if ((features & IFF_VNET_HDR) == 0) {
+ printk(UM_KERN_INFO "tapraw: No VNET HEADER support");
+ return false;
+ }
+ ioctl(fd, TUNSETVNETHDRSZ, &len);
+ return true;
+}
+
+static struct vector_fds *user_init_socket_fds(struct arglist *ifspec, int id)
+{
+ int err = -ENOMEM;
+ int fd = -1, gairet;
+ struct addrinfo srchints;
+ struct addrinfo dsthints;
+ bool v6, udp;
+ char *value;
+ char *src, *dst, *srcport, *dstport;
+ struct addrinfo *gairesult = NULL;
+ struct vector_fds *result = NULL;
+
+
+ value = uml_vector_fetch_arg(ifspec, "v6");
+ v6 = false;
+ udp = false;
+ if (value != NULL) {
+ if (strtol((const char *) value, NULL, 10) > 0)
+ v6 = true;
+ }
+
+ value = uml_vector_fetch_arg(ifspec, "udp");
+ if (value != NULL) {
+ if (strtol((const char *) value, NULL, 10) > 0)
+ udp = true;
+ }
+ src = uml_vector_fetch_arg(ifspec, "src");
+ dst = uml_vector_fetch_arg(ifspec, "dst");
+ srcport = uml_vector_fetch_arg(ifspec, "srcport");
+ dstport = uml_vector_fetch_arg(ifspec, "dstport");
+
+ memset(&dsthints, 0, sizeof(dsthints));
+
+ if (v6)
+ dsthints.ai_family = AF_INET6;
+ else
+ dsthints.ai_family = AF_INET;
+
+ switch (id) {
+ case ID_GRE:
+ dsthints.ai_socktype = SOCK_RAW;
+ dsthints.ai_protocol = IPPROTO_GRE;
+ break;
+ case ID_L2TPV3:
+ if (udp) {
+ dsthints.ai_socktype = SOCK_DGRAM;
+ dsthints.ai_protocol = 0;
+ } else {
+ dsthints.ai_socktype = SOCK_RAW;
+ dsthints.ai_protocol = IPPROTO_L2TP;
+ }
+ break;
+ default:
+ printk(KERN_ERR "Unsupported socket type\n");
+ return NULL;
+ }
+ memcpy(&srchints, &dsthints, sizeof(struct addrinfo));
+
+ gairet = getaddrinfo(src, srcport, &dsthints, &gairesult);
+ if ((gairet != 0) || (gairesult == NULL)) {
+ printk(UM_KERN_ERR
+ "socket_open : could not resolve src, error = %s",
+ gai_strerror(gairet)
+ );
+ return NULL;
+ }
+ fd = socket(gairesult->ai_family,
+ gairesult->ai_socktype, gairesult->ai_protocol);
+ if (fd == -1) {
+ printk(UM_KERN_ERR
+ "socket_open : could not open socket, error = %d",
+ -errno
+ );
+ goto cleanup;
+ }
+ if (bind(fd,
+ (struct sockaddr *) gairesult->ai_addr,
+ gairesult->ai_addrlen)) {
+ printk(UM_KERN_ERR L2TPV3_BIND_FAIL, errno);
+ goto cleanup;
+ }
+
+ if (gairesult != NULL)
+ freeaddrinfo(gairesult);
+
+ gairesult = NULL;
+
+ gairet = getaddrinfo(dst, dstport, &dsthints, &gairesult);
+ if ((gairet != 0) || (gairesult == NULL)) {
+ printk(UM_KERN_ERR
+ "socket_open : could not resolve dst, error = %s",
+ gai_strerror(gairet)
+ );
+ return NULL;
+ }
+
+ result = uml_kmalloc(sizeof(struct vector_fds), UM_GFP_KERNEL);
+ if (result != NULL) {
+ result->rx_fd = fd;
+ result->tx_fd = fd;
+ result->remote_addr = uml_kmalloc(
+ gairesult->ai_addrlen, UM_GFP_KERNEL);
+ if (result->remote_addr == NULL)
+ goto cleanup;
+ result->remote_addr_size = gairesult->ai_addrlen;
+ memcpy(
+ result->remote_addr,
+ gairesult->ai_addr,
+ gairesult->ai_addrlen
+ );
+ }
+ freeaddrinfo(gairesult);
+ return result;
+cleanup:
+ if (gairesult != NULL)
+ freeaddrinfo(gairesult);
+ printk(UM_KERN_ERR "user_init_socket: init failed, error %d", err);
+ if (fd >= 0)
+ os_close_file(fd);
+ if (result != NULL) {
+ if (result->remote_addr != NULL)
+ kfree(result->remote_addr);
+ kfree(result);
+ }
+ return NULL;
+}
+
+struct vector_fds *uml_vector_user_open(
+ int unit,
+ struct arglist *parsed
+)
+{
+ char *transport;
+
+ if (parsed == NULL) {
+ printk(UM_KERN_ERR "no parsed config for unit %d\n", unit);
+ return NULL;
+ }
+ transport = uml_vector_fetch_arg(parsed, "transport");
+ if (transport == NULL) {
+ printk(UM_KERN_ERR "missing transport for unit %d\n", unit);
+ return NULL;
+ }
+ if (strncmp(transport, TRANS_RAW, TRANS_RAW_LEN) == 0)
+ return user_init_raw_fds(parsed);
+ if (strncmp(transport, TRANS_TAP, TRANS_TAP_LEN) == 0)
+ return user_init_tap_fds(parsed);
+ if (strncmp(transport, TRANS_GRE, TRANS_GRE_LEN) == 0)
+ return user_init_socket_fds(parsed, ID_GRE);
+ if (strncmp(transport, TRANS_L2TPV3, TRANS_L2TPV3_LEN) == 0)
+ return user_init_socket_fds(parsed, ID_L2TPV3);
+ return NULL;
+}
+
+
+int uml_vector_sendmsg(int fd, void *hdr, int flags)
+{
+ int n;
+
+ CATCH_EINTR(n = sendmsg(fd, (struct msghdr *) hdr, flags));
+ if ((n < 0) && (errno == EAGAIN))
+ return 0;
+ if (n >= 0)
+ return n;
+ else
+ return -errno;
+}
+
+int uml_vector_recvmsg(int fd, void *hdr, int flags)
+{
+ int n;
+
+ CATCH_EINTR(n = recvmsg(fd, (struct msghdr *) hdr, flags));
+ if ((n < 0) && (errno == EAGAIN))
+ return 0;
+ if (n >= 0)
+ return n;
+ else
+ return -errno;
+}
+
+int uml_vector_writev(int fd, void *hdr, int iovcount)
+{
+ int n;
+
+ CATCH_EINTR(n = writev(fd, (struct iovec *) hdr, iovcount));
+ if ((n < 0) && (errno == EAGAIN))
+ return 0;
+ if (n >= 0)
+ return n;
+ else
+ return -errno;
+}
+
+int uml_vector_sendmmsg(
+ int fd,
+ void *msgvec,
+ unsigned int vlen,
+ unsigned int flags)
+{
+ int n;
+
+ CATCH_EINTR(n = sendmmsg(fd, (struct mmsghdr *) msgvec, vlen, flags));
+ if ((n < 0) && (errno == EAGAIN))
+ return 0;
+ if (n >= 0)
+ return n;
+ else
+ return -errno;
+}
+
+int uml_vector_recvmmsg(
+ int fd,
+ void *msgvec,
+ unsigned int vlen,
+ unsigned int flags)
+{
+ int n;
+
+ CATCH_EINTR(
+ n = recvmmsg(fd, (struct mmsghdr *) msgvec, vlen, flags, 0));
+ if ((n < 0) && (errno == EAGAIN))
+ return 0;
+ if (n >= 0)
+ return n;
+ else
+ return -errno;
+}
+int uml_vector_attach_bpf(int fd, void *bpf, int bpf_len)
+{
+ int err = setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, bpf, bpf_len);
+
+ if (err < 0)
+ printk(KERN_ERR BPF_ATTACH_FAIL, bpf_len, fd, -errno);
+ return err;
+}
+
+#define DEFAULT_BPF_LEN 6
+
+void *uml_vector_default_bpf(int fd, void *mac)
+{
+ struct sock_filter *bpf;
+ uint32_t *mac1 = (uint32_t *)(mac + 2);
+ uint16_t *mac2 = (uint16_t *) mac;
+ struct sock_fprog bpf_prog = {
+ .len = 6,
+ .filter = NULL,
+ };
+
+ bpf = uml_kmalloc(
+ sizeof(struct sock_filter) * DEFAULT_BPF_LEN, UM_GFP_KERNEL);
+ if (bpf != NULL) {
+ bpf_prog.filter = bpf;
+ /* ld [8] */
+ bpf[0] = (struct sock_filter){ 0x20, 0, 0, 0x00000008 };
+ /* jeq #0xMAC[2-6] jt 2 jf 5*/
+ bpf[1] = (struct sock_filter){ 0x15, 0, 3, ntohl(*mac1)};
+ /* ldh [6] */
+ bpf[2] = (struct sock_filter){ 0x28, 0, 0, 0x00000006 };
+ /* jeq #0xMAC[0-1] jt 4 jf 5 */
+ bpf[3] = (struct sock_filter){ 0x15, 0, 1, ntohs(*mac2)};
+ /* ret #0 */
+ bpf[4] = (struct sock_filter){ 0x6, 0, 0, 0x00000000 };
+ /* ret #0x40000 */
+ bpf[5] = (struct sock_filter){ 0x6, 0, 0, 0x00040000 };
+ if (uml_vector_attach_bpf(
+ fd, &bpf_prog, sizeof(struct sock_fprog)) < 0) {
+ kfree(bpf);
+ bpf = NULL;
+ }
+ }
+ return bpf;
+}
+
--- /dev/null
+/*
+ * Copyright (C) 2002 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+ * Licensed under the GPL
+ */
+
+#ifndef __UM_VECTOR_USER_H
+#define __UM_VECTOR_USER_H
+
+#define MAXVARGS 20
+
+#define TOKEN_IFNAME "ifname"
+
+#define TRANS_RAW "raw"
+#define TRANS_RAW_LEN strlen(TRANS_RAW)
+
+#define TRANS_TAP "tap"
+#define TRANS_TAP_LEN strlen(TRANS_TAP)
+
+
+#define TRANS_GRE "gre"
+#define TRANS_GRE_LEN strlen(TRANS_RAW)
+
+#define TRANS_L2TPV3 "l2tpv3"
+#define TRANS_L2TPV3_LEN strlen(TRANS_L2TPV3)
+
+#ifndef IPPROTO_GRE
+#define IPPROTO_GRE 0x2F
+#endif
+
+#define GRE_MODE_CHECKSUM cpu_to_be16(8 << 12) /* checksum */
+#define GRE_MODE_RESERVED cpu_to_be16(4 << 12) /* unused */
+#define GRE_MODE_KEY cpu_to_be16(2 << 12) /* KEY present */
+#define GRE_MODE_SEQUENCE cpu_to_be16(1 << 12) /* sequence */
+
+#define GRE_IRB cpu_to_be16(0x6558)
+
+#define L2TPV3_DATA_PACKET 0x30000
+
+/* IANA-assigned IP protocol ID for L2TPv3 */
+
+#ifndef IPPROTO_L2TP
+#define IPPROTO_L2TP 0x73
+#endif
+
+struct arglist {
+ int numargs;
+ char *tokens[MAXVARGS];
+ char *values[MAXVARGS];
+};
+
+/* Separating read and write FDs allows us to have different
+ * rx and tx method. Example - read tap via raw socket using
+ * recvmmsg, write using legacy tap write calls
+ */
+
+struct vector_fds {
+ int rx_fd;
+ int tx_fd;
+ void *remote_addr;
+ int remote_addr_size;
+};
+
+#define VECTOR_READ 1
+#define VECTOR_WRITE (1 < 1)
+#define VECTOR_HEADERS (1 < 2)
+
+extern struct arglist *uml_parse_vector_ifspec(char *arg);
+
+extern struct vector_fds *uml_vector_user_open(
+ int unit,
+ struct arglist *parsed
+);
+
+extern char *uml_vector_fetch_arg(
+ struct arglist *ifspec,
+ char *token
+);
+
+extern int uml_vector_recvmsg(int fd, void *hdr, int flags);
+extern int uml_vector_sendmsg(int fd, void *hdr, int flags);
+extern int uml_vector_writev(int fd, void *hdr, int iovcount);
+extern int uml_vector_sendmmsg(
+ int fd, void *msgvec,
+ unsigned int vlen,
+ unsigned int flags
+);
+extern int uml_vector_recvmmsg(
+ int fd,
+ void *msgvec,
+ unsigned int vlen,
+ unsigned int flags
+);
+extern void *uml_vector_default_bpf(int fd, void *mac);
+extern int uml_vector_attach_bpf(int fd, void *bpf, int bpf_len);
+extern bool uml_raw_enable_qdisc_bypass(int fd);
+extern bool uml_raw_enable_vnet_headers(int fd);
+extern bool uml_tap_enable_vnet_headers(int fd);
+
+
+#endif
--- /dev/null
+#include <asm-generic/asm-prototypes.h>
#define XTERM_IRQ 13
#define RANDOM_IRQ 14
+#ifdef CONFIG_UML_NET_VECTOR
+
+#define VECTOR_BASE_IRQ 15
+#define VECTOR_IRQ_SPACE 8
+
+#define LAST_IRQ (VECTOR_IRQ_SPACE + VECTOR_BASE_IRQ)
+
+#else
+
#define LAST_IRQ RANDOM_IRQ
+
+#endif
+
#define NR_IRQS (LAST_IRQ + 1)
#endif
#define __IRQ_USER_H__
#include <sysdep/ptrace.h>
+#include <stdbool.h>
struct irq_fd {
struct irq_fd *next;
int type;
int irq;
int events;
- int current_events;
+ bool active;
+ bool pending;
+ bool purge;
};
-enum { IRQ_READ, IRQ_WRITE };
+#define IRQ_READ 0
+#define IRQ_WRITE 1
+#define IRQ_NONE 2
+#define MAX_IRQ_TYPE (IRQ_NONE + 1)
+
+
struct siginfo;
extern void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs);
char **mac_out, char **gate_addr);
extern void register_transport(struct transport *new);
extern unsigned short eth_protocol(struct sk_buff *skb);
+extern void uml_net_setup_etheraddr(struct net_device *dev, char *str);
+
#endif
extern void reboot_skas(void);
/* irq.c */
-extern int os_waiting_for_events(struct irq_fd *active_fds);
-extern int os_create_pollfd(int fd, int events, void *tmp_pfd, int size_tmpfds);
-extern void os_free_irq_by_cb(int (*test)(struct irq_fd *, void *), void *arg,
- struct irq_fd *active_fds, struct irq_fd ***last_irq_ptr2);
-extern void os_free_irq_later(struct irq_fd *active_fds,
- int irq, void *dev_id);
-extern int os_get_pollfd(int i);
-extern void os_set_pollfd(int i, int fd);
+extern int os_waiting_for_events_epoll(void);
+extern void *os_epoll_get_data_pointer(int index);
+extern int os_epoll_triggered(int index, int events);
+extern int os_event_mask(int irq_type);
+extern int os_setup_epoll(void);
+extern int os_add_epoll_fd(int events, int fd, void *data);
+extern int os_mod_epoll_fd(int events, int fd, void *data);
+extern int os_del_epoll_fd(int fd);
extern void os_set_ioignore(void);
+extern void os_close_epoll_fd(void);
/* sigio.c */
extern int add_sigio_fd(int fd);
/*
+ * Copyright (C) 2017 - Cambridge Greys Ltd
+ * Copyright (C) 2011 - 2014 Cisco Systems Inc
* Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
* Licensed under the GPL
* Derived (i.e. mostly copied) from arch/i386/kernel/irq.c:
#include <as-layout.h>
#include <kern_util.h>
#include <os.h>
+#include <irq_user.h>
-/*
- * This list is accessed under irq_lock, except in sigio_handler,
- * where it is safe from being modified. IRQ handlers won't change it -
- * if an IRQ source has vanished, it will be freed by free_irqs just
- * before returning from sigio_handler. That will process a separate
- * list of irqs to free, with its own locking, coming back here to
- * remove list elements, taking the irq_lock to do so.
+
+/* When epoll triggers we do not know why it did so
+ * we can also have different IRQs for read and write.
+ * This is why we keep a small irq_fd array for each fd -
+ * one entry per IRQ type
*/
-static struct irq_fd *active_fds = NULL;
-static struct irq_fd **last_irq_ptr = &active_fds;
-extern void free_irqs(void);
+struct irq_entry {
+ struct irq_entry *next;
+ int fd;
+ struct irq_fd *irq_array[MAX_IRQ_TYPE + 1];
+};
+
+static struct irq_entry *active_fds;
+
+static DEFINE_SPINLOCK(irq_lock);
+
+static void irq_io_loop(struct irq_fd *irq, struct uml_pt_regs *regs)
+{
+/*
+ * irq->active guards against reentry
+ * irq->pending accumulates pending requests
+ * if pending is raised the irq_handler is re-run
+ * until pending is cleared
+ */
+ if (irq->active) {
+ irq->active = false;
+ do {
+ irq->pending = false;
+ do_IRQ(irq->irq, regs);
+ } while (irq->pending && (!irq->purge));
+ if (!irq->purge)
+ irq->active = true;
+ } else {
+ irq->pending = true;
+ }
+}
void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs)
{
- struct irq_fd *irq_fd;
- int n;
+ struct irq_entry *irq_entry;
+ struct irq_fd *irq;
+
+ int n, i, j;
while (1) {
- n = os_waiting_for_events(active_fds);
+ /* This is now lockless - epoll keeps back-referencesto the irqs
+ * which have trigger it so there is no need to walk the irq
+ * list and lock it every time. We avoid locking by turning off
+ * IO for a specific fd by executing os_del_epoll_fd(fd) before
+ * we do any changes to the actual data structures
+ */
+ n = os_waiting_for_events_epoll();
+
if (n <= 0) {
if (n == -EINTR)
continue;
- else break;
+ else
+ break;
}
- for (irq_fd = active_fds; irq_fd != NULL;
- irq_fd = irq_fd->next) {
- if (irq_fd->current_events != 0) {
- irq_fd->current_events = 0;
- do_IRQ(irq_fd->irq, regs);
+ for (i = 0; i < n ; i++) {
+ /* Epoll back reference is the entry with 3 irq_fd
+ * leaves - one for each irq type.
+ */
+ irq_entry = (struct irq_entry *)
+ os_epoll_get_data_pointer(i);
+ for (j = 0; j < MAX_IRQ_TYPE ; j++) {
+ irq = irq_entry->irq_array[j];
+ if (irq == NULL)
+ continue;
+ if (os_epoll_triggered(i, irq->events) > 0)
+ irq_io_loop(irq, regs);
+ if (irq->purge) {
+ irq_entry->irq_array[j] = NULL;
+ kfree(irq);
+ }
}
}
}
+}
+
+static int assign_epoll_events_to_irq(struct irq_entry *irq_entry)
+{
+ int i;
+ int events = 0;
+ struct irq_fd *irq;
- free_irqs();
+ for (i = 0; i < MAX_IRQ_TYPE ; i++) {
+ irq = irq_entry->irq_array[i];
+ if (irq != NULL)
+ events = irq->events | events;
+ }
+ if (events > 0) {
+ /* os_add_epoll will call os_mod_epoll if this already exists */
+ return os_add_epoll_fd(events, irq_entry->fd, irq_entry);
+ }
+ /* No events - delete */
+ return os_del_epoll_fd(irq_entry->fd);
}
-static DEFINE_SPINLOCK(irq_lock);
+
static int activate_fd(int irq, int fd, int type, void *dev_id)
{
- struct pollfd *tmp_pfd;
- struct irq_fd *new_fd, *irq_fd;
+ struct irq_fd *new_fd;
+ struct irq_entry *irq_entry;
+ int i, err, events;
unsigned long flags;
- int events, err, n;
err = os_set_fd_async(fd);
if (err < 0)
goto out;
- err = -ENOMEM;
- new_fd = kmalloc(sizeof(struct irq_fd), GFP_KERNEL);
- if (new_fd == NULL)
- goto out;
+ spin_lock_irqsave(&irq_lock, flags);
- if (type == IRQ_READ)
- events = UM_POLLIN | UM_POLLPRI;
- else events = UM_POLLOUT;
- *new_fd = ((struct irq_fd) { .next = NULL,
- .id = dev_id,
- .fd = fd,
- .type = type,
- .irq = irq,
- .events = events,
- .current_events = 0 } );
+ /* Check if we have an entry for this fd */
err = -EBUSY;
- spin_lock_irqsave(&irq_lock, flags);
- for (irq_fd = active_fds; irq_fd != NULL; irq_fd = irq_fd->next) {
- if ((irq_fd->fd == fd) && (irq_fd->type == type)) {
- printk(KERN_ERR "Registering fd %d twice\n", fd);
- printk(KERN_ERR "Irqs : %d, %d\n", irq_fd->irq, irq);
- printk(KERN_ERR "Ids : 0x%p, 0x%p\n", irq_fd->id,
- dev_id);
+ for (irq_entry = active_fds;
+ irq_entry != NULL; irq_entry = irq_entry->next) {
+ if (irq_entry->fd == fd)
+ break;
+ }
+
+ if (irq_entry == NULL) {
+ /* This needs to be atomic as it may be called from an
+ * IRQ context.
+ */
+ irq_entry = kmalloc(sizeof(struct irq_entry), GFP_ATOMIC);
+ if (irq_entry == NULL) {
+ printk(KERN_ERR
+ "Failed to allocate new IRQ entry\n");
goto out_unlock;
}
+ irq_entry->fd = fd;
+ for (i = 0; i < MAX_IRQ_TYPE; i++)
+ irq_entry->irq_array[i] = NULL;
+ irq_entry->next = active_fds;
+ active_fds = irq_entry;
}
- if (type == IRQ_WRITE)
- fd = -1;
-
- tmp_pfd = NULL;
- n = 0;
+ /* Check if we are trying to re-register an interrupt for a
+ * particular fd
+ */
- while (1) {
- n = os_create_pollfd(fd, events, tmp_pfd, n);
- if (n == 0)
- break;
+ if (irq_entry->irq_array[type] != NULL) {
+ printk(KERN_ERR
+ "Trying to reregister IRQ %d FD %d TYPE %d ID %p\n",
+ irq, fd, type, dev_id
+ );
+ goto out_unlock;
+ } else {
+ /* New entry for this fd */
+
+ err = -ENOMEM;
+ new_fd = kmalloc(sizeof(struct irq_fd), GFP_ATOMIC);
+ if (new_fd == NULL)
+ goto out_unlock;
- /*
- * n > 0
- * It means we couldn't put new pollfd to current pollfds
- * and tmp_fds is NULL or too small for new pollfds array.
- * Needed size is equal to n as minimum.
- *
- * Here we have to drop the lock in order to call
- * kmalloc, which might sleep.
- * If something else came in and changed the pollfds array
- * so we will not be able to put new pollfd struct to pollfds
- * then we free the buffer tmp_fds and try again.
+ events = os_event_mask(type);
+
+ *new_fd = ((struct irq_fd) {
+ .id = dev_id,
+ .irq = irq,
+ .type = type,
+ .events = events,
+ .active = true,
+ .pending = false,
+ .purge = false
+ });
+ /* Turn off any IO on this fd - allows us to
+ * avoid locking the IRQ loop
*/
- spin_unlock_irqrestore(&irq_lock, flags);
- kfree(tmp_pfd);
-
- tmp_pfd = kmalloc(n, GFP_KERNEL);
- if (tmp_pfd == NULL)
- goto out_kfree;
-
- spin_lock_irqsave(&irq_lock, flags);
+ os_del_epoll_fd(irq_entry->fd);
+ irq_entry->irq_array[type] = new_fd;
}
- *last_irq_ptr = new_fd;
- last_irq_ptr = &new_fd->next;
-
+ /* Turn back IO on with the correct (new) IO event mask */
+ assign_epoll_events_to_irq(irq_entry);
spin_unlock_irqrestore(&irq_lock, flags);
-
- /*
- * This calls activate_fd, so it has to be outside the critical
- * section.
- */
- maybe_sigio_broken(fd, (type == IRQ_READ));
+ maybe_sigio_broken(fd, (type != IRQ_NONE));
return 0;
-
- out_unlock:
+out_unlock:
spin_unlock_irqrestore(&irq_lock, flags);
- out_kfree:
- kfree(new_fd);
- out:
+out:
return err;
}
-static void free_irq_by_cb(int (*test)(struct irq_fd *, void *), void *arg)
+/*
+ * Walk the IRQ list and dispose of any unused entries.
+ * Should be done under irq_lock.
+ */
+
+static void garbage_collect_irq_entries(void)
{
- unsigned long flags;
+ int i;
+ bool reap;
+ struct irq_entry *walk;
+ struct irq_entry *previous = NULL;
+ struct irq_entry *to_free;
- spin_lock_irqsave(&irq_lock, flags);
- os_free_irq_by_cb(test, arg, active_fds, &last_irq_ptr);
- spin_unlock_irqrestore(&irq_lock, flags);
+ if (active_fds == NULL)
+ return;
+ walk = active_fds;
+ while (walk != NULL) {