2 * Copyright © 2006-2014 Intel Corporation.
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * Authors: David Woodhouse <dwmw2@infradead.org>,
14 * Ashok Raj <ashok.raj@intel.com>,
15 * Shaohua Li <shaohua.li@intel.com>,
16 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
17 * Fenghua Yu <fenghua.yu@intel.com>
18 * Joerg Roedel <jroedel@suse.de>
21 #define pr_fmt(fmt) "DMAR: " fmt
23 #include <linux/init.h>
24 #include <linux/bitmap.h>
25 #include <linux/debugfs.h>
26 #include <linux/export.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/memory.h>
36 #include <linux/cpu.h>
37 #include <linux/timer.h>
39 #include <linux/iova.h>
40 #include <linux/iommu.h>
41 #include <linux/intel-iommu.h>
42 #include <linux/syscore_ops.h>
43 #include <linux/tboot.h>
44 #include <linux/dmi.h>
45 #include <linux/pci-ats.h>
46 #include <linux/memblock.h>
47 #include <linux/dma-contiguous.h>
48 #include <linux/crash_dump.h>
49 #include <asm/irq_remapping.h>
50 #include <asm/cacheflush.h>
51 #include <asm/iommu.h>
53 #include "irq_remapping.h"
55 #define ROOT_SIZE VTD_PAGE_SIZE
56 #define CONTEXT_SIZE VTD_PAGE_SIZE
58 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
59 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
60 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
61 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
63 #define IOAPIC_RANGE_START (0xfee00000)
64 #define IOAPIC_RANGE_END (0xfeefffff)
65 #define IOVA_START_ADDR (0x1000)
67 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
69 #define MAX_AGAW_WIDTH 64
70 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
72 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
73 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
75 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
76 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
77 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
78 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
79 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
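/*
 * Illustrative note (the example_* helper below is not part of the
 * original driver): for a 48-bit guest address width the macros above
 * give a 36-bit PFN space, i.e. __DOMAIN_MAX_PFN(48) == (1ULL << 36) - 1,
 * and DOMAIN_MAX_ADDR(48) is that maximum PFN shifted back up by
 * VTD_PAGE_SHIFT.  The min_t() clamp in DOMAIN_MAX_PFN() only matters on
 * 32-bit kernels, where the full 64-bit PFN value would not fit in an
 * unsigned long.
 */
static inline unsigned long example_domain_max_pfn_48bit(void)
{
        /* 0xFFFFFFFFFUL when unsigned long is 64 bits wide */
        return DOMAIN_MAX_PFN(48);
}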
81 /* IO virtual address start page frame number */
82 #define IOVA_START_PFN (1)
84 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
86 /* page table handling */
87 #define LEVEL_STRIDE (9)
88 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
91 * This bitmap is used to advertise the page sizes our hardware supports
92 * to the IOMMU core, which will then use this information to split
93 * physically contiguous memory regions it is mapping into page sizes
94 * that we support.
96 * Traditionally the IOMMU core just handed us the mappings directly,
97 * after making sure the size is an order of a 4KiB page and that the
98 * mapping has natural alignment.
100 * To retain this behavior, we currently advertise that we support
101 * all page sizes that are an order of 4KiB.
103 * If at some point we'd like to utilize the IOMMU core's new behavior,
104 * we could change this to advertise the real page sizes we support.
106 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
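/*
 * Illustrative note (the example_* helper below is not part of the
 * original driver): in the pgsize bitmap handed to the IOMMU core, bit n
 * advertises support for a 2^n byte page size.  ~0xFFFUL clears bits
 * 0-11 and sets every bit from 12 upwards, i.e. every power-of-two size
 * from 4KiB up, which is exactly the "any order of 4KiB" behaviour
 * described above.
 */
static inline bool example_pgsize_advertised(unsigned long size)
{
        /* true for 4KiB, 8KiB, 2MiB, 1GiB, ...; false for e.g. 2KiB */
        return is_power_of_2(size) && (size & INTEL_IOMMU_PGSIZES);
}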
108 static inline int agaw_to_level(int agaw)
113 static inline int agaw_to_width(int agaw)
115 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
118 static inline int width_to_agaw(int width)
120 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
123 static inline unsigned int level_to_offset_bits(int level)
125 return (level - 1) * LEVEL_STRIDE;
128 static inline int pfn_level_offset(unsigned long pfn, int level)
130 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
133 static inline unsigned long level_mask(int level)
135 return -1UL << level_to_offset_bits(level);
138 static inline unsigned long level_size(int level)
140 return 1UL << level_to_offset_bits(level);
143 static inline unsigned long align_to_level(unsigned long pfn, int level)
145 return (pfn + level_size(level) - 1) & level_mask(level);
148 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
150 return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
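/*
 * Illustrative note (the example_* helper below is not part of the
 * original driver): the helpers above tie address width, agaw and table
 * depth together.  With LEVEL_STRIDE == 9, width_to_agaw(48) == 2,
 * agaw_to_width(2) == 48, and agaw 2 corresponds to a 4-level table:
 * each level translates 9 bits on top of the 12-bit page offset, so
 * 12 + 4 * 9 == 48.
 */
static inline int example_levels_for_width(int width)
{
        /* e.g. 39 -> 3 levels, 48 -> 4 levels, 57 -> 5 levels */
        return agaw_to_level(width_to_agaw(width));
}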
153 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
154 are never going to work. */
155 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
157 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
160 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
162 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
164 static inline unsigned long page_to_dma_pfn(struct page *pg)
166 return mm_to_dma_pfn(page_to_pfn(pg));
168 static inline unsigned long virt_to_dma_pfn(void *p)
170 return page_to_dma_pfn(virt_to_page(p));
173 /* global iommu list, set NULL for ignored DMAR units */
174 static struct intel_iommu **g_iommus;
176 static void __init check_tylersburg_isoch(void);
177 static int rwbf_quirk;
180 * set to 1 to panic the kernel if VT-d can't successfully be enabled
181 * (used when the kernel is launched w/ TXT)
183 static int force_on = 0;
184 int intel_iommu_tboot_noforce;
189 * 12-63: Context Ptr (12 - (haw-1))
196 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
199 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
202 static phys_addr_t root_entry_lctp(struct root_entry *re)
207 return re->lo & VTD_PAGE_MASK;
211 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
214 static phys_addr_t root_entry_uctp(struct root_entry *re)
219 return re->hi & VTD_PAGE_MASK;
224 * 1: fault processing disable
225 * 2-3: translation type
226 * 12-63: address space root
232 struct context_entry {
237 static inline void context_clear_pasid_enable(struct context_entry *context)
239 context->lo &= ~(1ULL << 11);
242 static inline bool context_pasid_enabled(struct context_entry *context)
244 return !!(context->lo & (1ULL << 11));
247 static inline void context_set_copied(struct context_entry *context)
249 context->hi |= (1ull << 3);
252 static inline bool context_copied(struct context_entry *context)
254 return !!(context->hi & (1ULL << 3));
257 static inline bool __context_present(struct context_entry *context)
259 return (context->lo & 1);
262 static inline bool context_present(struct context_entry *context)
264 return context_pasid_enabled(context) ?
265 __context_present(context) :
266 __context_present(context) && !context_copied(context);
269 static inline void context_set_present(struct context_entry *context)
274 static inline void context_set_fault_enable(struct context_entry *context)
276 context->lo &= (((u64)-1) << 2) | 1;
279 static inline void context_set_translation_type(struct context_entry *context,
282 context->lo &= (((u64)-1) << 4) | 3;
283 context->lo |= (value & 3) << 2;
286 static inline void context_set_address_root(struct context_entry *context,
289 context->lo &= ~VTD_PAGE_MASK;
290 context->lo |= value & VTD_PAGE_MASK;
293 static inline void context_set_address_width(struct context_entry *context,
296 context->hi |= value & 7;
299 static inline void context_set_domain_id(struct context_entry *context,
302 context->hi |= (value & ((1 << 16) - 1)) << 8;
305 static inline int context_domain_id(struct context_entry *c)
307 return((c->hi >> 8) & 0xffff);
310 static inline void context_clear_entry(struct context_entry *context)
323 * 12-63: Host physical address
329 static inline void dma_clear_pte(struct dma_pte *pte)
334 static inline u64 dma_pte_addr(struct dma_pte *pte)
337 return pte->val & VTD_PAGE_MASK;
339 /* Must have a full atomic 64-bit read */
340 return __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
344 static inline bool dma_pte_present(struct dma_pte *pte)
346 return (pte->val & 3) != 0;
349 static inline bool dma_pte_superpage(struct dma_pte *pte)
351 return (pte->val & DMA_PTE_LARGE_PAGE);
354 static inline int first_pte_in_page(struct dma_pte *pte)
356 return !((unsigned long)pte & ~VTD_PAGE_MASK);
360 * This domain is a static identity mapping domain.
361 * 1. This domain creates a static 1:1 mapping to all usable memory.
362 * 2. It maps to each iommu if successful.
363 * 3. Each iommu maps to this domain if successful.
365 static struct dmar_domain *si_domain;
366 static int hw_pass_through = 1;
369 * Domain represents a virtual machine; more than one device
370 * across iommus may be owned by one domain, e.g. a kvm guest.
372 #define DOMAIN_FLAG_VIRTUAL_MACHINE (1 << 0)
374 /* si_domain contains multiple devices */
375 #define DOMAIN_FLAG_STATIC_IDENTITY (1 << 1)
377 #define for_each_domain_iommu(idx, domain) \
378 for (idx = 0; idx < g_num_of_iommus; idx++) \
379 if (domain->iommu_refcnt[idx])
382 int nid; /* node id */
384 unsigned iommu_refcnt[DMAR_UNITS_SUPPORTED];
385 /* Refcount of devices per iommu */
388 u16 iommu_did[DMAR_UNITS_SUPPORTED];
389 /* Domain ids per IOMMU. Use u16 since
390 * domain ids are 16 bit wide according
391 * to VT-d spec, section 9.3 */
393 bool has_iotlb_device;
394 struct list_head devices; /* all devices' list */
395 struct iova_domain iovad; /* iova's that belong to this domain */
397 struct dma_pte *pgd; /* virtual address */
398 int gaw; /* max guest address width */
400 /* adjusted guest address width, 0 is level 2 30-bit */
403 int flags; /* flags to find out type of domain */
405 int iommu_coherency;/* indicate coherency of iommu access */
406 int iommu_snooping; /* indicate snooping control feature*/
407 int iommu_count; /* reference count of iommu */
408 int iommu_superpage;/* Level of superpages supported:
409 0 == 4KiB (no superpages), 1 == 2MiB,
410 2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
411 u64 max_addr; /* maximum mapped address */
413 struct iommu_domain domain; /* generic domain data structure for
417 /* PCI domain-device relationship */
418 struct device_domain_info {
419 struct list_head link; /* link to domain siblings */
420 struct list_head global; /* link to global list */
421 u8 bus; /* PCI bus number */
422 u8 devfn; /* PCI devfn number */
423 u8 pasid_supported:3;
430 struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
431 struct intel_iommu *iommu; /* IOMMU used by this device */
432 struct dmar_domain *domain; /* pointer to domain */
435 struct dmar_rmrr_unit {
436 struct list_head list; /* list of rmrr units */
437 struct acpi_dmar_header *hdr; /* ACPI header */
438 u64 base_address; /* reserved base address*/
439 u64 end_address; /* reserved end address */
440 struct dmar_dev_scope *devices; /* target devices */
441 int devices_cnt; /* target device count */
442 struct iommu_resv_region *resv; /* reserved region handle */
445 struct dmar_atsr_unit {
446 struct list_head list; /* list of ATSR units */
447 struct acpi_dmar_header *hdr; /* ACPI header */
448 struct dmar_dev_scope *devices; /* target devices */
449 int devices_cnt; /* target device count */
450 u8 include_all:1; /* include all ports */
453 static LIST_HEAD(dmar_atsr_units);
454 static LIST_HEAD(dmar_rmrr_units);
456 #define for_each_rmrr_units(rmrr) \
457 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
459 /* bitmap for indexing intel_iommus */
460 static int g_num_of_iommus;
462 static void domain_exit(struct dmar_domain *domain);
463 static void domain_remove_dev_info(struct dmar_domain *domain);
464 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
466 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
467 static void domain_context_clear(struct intel_iommu *iommu,
469 static int domain_detach_iommu(struct dmar_domain *domain,
470 struct intel_iommu *iommu);
472 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
473 int dmar_disabled = 0;
475 int dmar_disabled = 1;
476 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
478 int intel_iommu_enabled = 0;
479 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
481 static int dmar_map_gfx = 1;
482 static int dmar_forcedac;
483 static int intel_iommu_strict;
484 static int intel_iommu_superpage = 1;
485 static int intel_iommu_ecs = 1;
486 static int intel_iommu_pasid28;
487 static int iommu_identity_mapping;
489 #define IDENTMAP_ALL 1
490 #define IDENTMAP_GFX 2
491 #define IDENTMAP_AZALIA 4
493 /* Broadwell and Skylake have broken ECS support — normal so-called "second
494 * level" translation of DMA requests-without-PASID doesn't actually happen
495 * unless you also set the NESTE bit in an extended context-entry. Which of
496 * course means that SVM doesn't work because it's trying to do nested
497 * translation of the physical addresses it finds in the process page tables,
498 * through the IOVA->phys mapping found in the "second level" page tables.
500 * The VT-d specification was retroactively changed to change the definition
501 * of the capability bits and pretend that Broadwell/Skylake never happened...
502 * but unfortunately the wrong bit was changed. It's ECS which is broken, but
503 * for some reason it was the PASID capability bit which was redefined (from
504 * bit 28 on BDW/SKL to bit 40 in future).
506 * So our test for ECS needs to eschew those implementations which set the old
507 * PASID capability bit 28, since those are the ones on which ECS is broken.
508 * Unless we are working around the 'pasid28' limitations, that is, by putting
509 * the device into passthrough mode for normal DMA and thus masking the bug.
511 #define ecs_enabled(iommu) (intel_iommu_ecs && ecap_ecs(iommu->ecap) && \
512 (intel_iommu_pasid28 || !ecap_broken_pasid(iommu->ecap)))
513 /* PASID support is thus enabled if ECS is enabled and *either* of the old
514 * or new capability bits is set. */
515 #define pasid_enabled(iommu) (ecs_enabled(iommu) && \
516 (ecap_pasid(iommu->ecap) || ecap_broken_pasid(iommu->ecap)))
518 int intel_iommu_gfx_mapped;
519 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
521 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
522 static DEFINE_SPINLOCK(device_domain_lock);
523 static LIST_HEAD(device_domain_list);
525 const struct iommu_ops intel_iommu_ops;
527 static bool translation_pre_enabled(struct intel_iommu *iommu)
529 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
532 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
534 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
537 static void init_translation_status(struct intel_iommu *iommu)
541 gsts = readl(iommu->reg + DMAR_GSTS_REG);
542 if (gsts & DMA_GSTS_TES)
543 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
546 /* Convert a generic 'struct iommu_domain' to the private 'struct dmar_domain' */
547 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
549 return container_of(dom, struct dmar_domain, domain);
552 static int __init intel_iommu_setup(char *str)
557 if (!strncmp(str, "on", 2)) {
559 pr_info("IOMMU enabled\n");
560 } else if (!strncmp(str, "off", 3)) {
562 pr_info("IOMMU disabled\n");
563 } else if (!strncmp(str, "igfx_off", 8)) {
565 pr_info("Disable GFX device mapping\n");
566 } else if (!strncmp(str, "forcedac", 8)) {
567 pr_info("Forcing DAC for PCI devices\n");
569 } else if (!strncmp(str, "strict", 6)) {
570 pr_info("Disable batched IOTLB flush\n");
571 intel_iommu_strict = 1;
572 } else if (!strncmp(str, "sp_off", 6)) {
573 pr_info("Disable supported super page\n");
574 intel_iommu_superpage = 0;
575 } else if (!strncmp(str, "ecs_off", 7)) {
577 "Intel-IOMMU: disable extended context table support\n");
579 } else if (!strncmp(str, "pasid28", 7)) {
581 "Intel-IOMMU: enable pre-production PASID support\n");
582 intel_iommu_pasid28 = 1;
583 iommu_identity_mapping |= IDENTMAP_GFX;
584 } else if (!strncmp(str, "tboot_noforce", 13)) {
586 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
587 intel_iommu_tboot_noforce = 1;
590 str += strcspn(str, ",");
596 __setup("intel_iommu=", intel_iommu_setup);
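/*
 * Illustrative usage note (not part of the original code): the handler
 * above consumes a comma-separated option list, so several options can
 * be combined on the kernel command line, e.g.
 *
 *	intel_iommu=on,strict,sp_off
 *
 * which enables the IOMMU, disables batched IOTLB flushing and turns
 * off superpage support in one go.
 */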
598 static struct kmem_cache *iommu_domain_cache;
599 static struct kmem_cache *iommu_devinfo_cache;
601 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
603 struct dmar_domain **domains;
606 domains = iommu->domains[idx];
610 return domains[did & 0xff];
613 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
614 struct dmar_domain *domain)
616 struct dmar_domain **domains;
619 if (!iommu->domains[idx]) {
620 size_t size = 256 * sizeof(struct dmar_domain *);
621 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
624 domains = iommu->domains[idx];
625 if (WARN_ON(!domains))
628 domains[did & 0xff] = domain;
631 static inline void *alloc_pgtable_page(int node)
636 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
638 vaddr = page_address(page);
642 static inline void free_pgtable_page(void *vaddr)
644 free_page((unsigned long)vaddr);
647 static inline void *alloc_domain_mem(void)
649 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
652 static void free_domain_mem(void *vaddr)
654 kmem_cache_free(iommu_domain_cache, vaddr);
657 static inline void * alloc_devinfo_mem(void)
659 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
662 static inline void free_devinfo_mem(void *vaddr)
664 kmem_cache_free(iommu_devinfo_cache, vaddr);
667 static inline int domain_type_is_vm(struct dmar_domain *domain)
669 return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
672 static inline int domain_type_is_si(struct dmar_domain *domain)
674 return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
677 static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
679 return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
680 DOMAIN_FLAG_STATIC_IDENTITY);
683 static inline int domain_pfn_supported(struct dmar_domain *domain,
686 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
688 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
691 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
696 sagaw = cap_sagaw(iommu->cap);
697 for (agaw = width_to_agaw(max_gaw);
699 if (test_bit(agaw, &sagaw))
707 * Calculate max SAGAW for each iommu.
709 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
711 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
715 * Calculate the agaw for each iommu.
716 * "SAGAW" may be different across iommus; use a default agaw, and
717 * fall back to a supported smaller agaw for iommus that don't support the default.
719 int iommu_calculate_agaw(struct intel_iommu *iommu)
721 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
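/*
 * Illustrative note (the example_* helper below is not part of the
 * original driver): cap_sagaw() is a bitmap of supported table depths
 * (bit 1 = 3-level/39-bit, bit 2 = 4-level/48-bit, bit 3 = 5-level/57-bit),
 * and __iommu_calculate_agaw() walks downwards from the requested width
 * until it finds a depth the hardware actually advertises.
 */
static inline int example_pick_agaw(unsigned long sagaw, int max_gaw)
{
        /* e.g. sagaw == 0x2 (3-level only), max_gaw == 48 -> returns 1 */
        int agaw;

        for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--)
                if (sagaw & (1UL << agaw))
                        return agaw;
        return -1;
}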
724 /* This function only returns a single iommu in a domain */
725 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
729 /* si_domain and vm domain should not get here. */
730 BUG_ON(domain_type_is_vm_or_si(domain));
731 for_each_domain_iommu(iommu_id, domain)
734 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
737 return g_iommus[iommu_id];
740 static void domain_update_iommu_coherency(struct dmar_domain *domain)
742 struct dmar_drhd_unit *drhd;
743 struct intel_iommu *iommu;
747 domain->iommu_coherency = 1;
749 for_each_domain_iommu(i, domain) {
751 if (!ecap_coherent(g_iommus[i]->ecap)) {
752 domain->iommu_coherency = 0;
759 /* No hardware attached; use lowest common denominator */
761 for_each_active_iommu(iommu, drhd) {
762 if (!ecap_coherent(iommu->ecap)) {
763 domain->iommu_coherency = 0;
770 static int domain_update_iommu_snooping(struct intel_iommu *skip)
772 struct dmar_drhd_unit *drhd;
773 struct intel_iommu *iommu;
777 for_each_active_iommu(iommu, drhd) {
779 if (!ecap_sc_support(iommu->ecap)) {
790 static int domain_update_iommu_superpage(struct intel_iommu *skip)
792 struct dmar_drhd_unit *drhd;
793 struct intel_iommu *iommu;
796 if (!intel_iommu_superpage) {
800 /* set iommu_superpage to the smallest common denominator */
802 for_each_active_iommu(iommu, drhd) {
804 mask &= cap_super_page_val(iommu->cap);
814 /* Some capabilities may be different across iommus */
815 static void domain_update_iommu_cap(struct dmar_domain *domain)
817 domain_update_iommu_coherency(domain);
818 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
819 domain->iommu_superpage = domain_update_iommu_superpage(NULL);
822 static inline struct context_entry *iommu_context_addr(struct intel_iommu *iommu,
823 u8 bus, u8 devfn, int alloc)
825 struct root_entry *root = &iommu->root_entry[bus];
826 struct context_entry *context;
830 if (ecs_enabled(iommu)) {
838 context = phys_to_virt(*entry & VTD_PAGE_MASK);
840 unsigned long phy_addr;
844 context = alloc_pgtable_page(iommu->node);
848 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
849 phy_addr = virt_to_phys((void *)context);
850 *entry = phy_addr | 1;
851 __iommu_flush_cache(iommu, entry, sizeof(*entry));
853 return &context[devfn];
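/*
 * Illustrative note (the example_* helper below is not part of the
 * original driver): with extended context support each root entry holds
 * two context-table pointers, so the 256 devfns of a bus are split in
 * half: root_entry_lctp() covers devfn 0x00-0x7f and root_entry_uctp()
 * covers devfn 0x80-0xff.  That is why free_context_table() below looks
 * up both devfn 0 and devfn 0x80 for every bus.
 */
static inline phys_addr_t example_context_table_base(struct root_entry *re,
                                                     u8 devfn)
{
        /* ECS layout only; without ECS everything uses the low half */
        return devfn < 0x80 ? root_entry_lctp(re) : root_entry_uctp(re);
}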
856 static int iommu_dummy(struct device *dev)
858 return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
861 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
863 struct dmar_drhd_unit *drhd = NULL;
864 struct intel_iommu *iommu;
866 struct pci_dev *ptmp, *pdev = NULL;
870 if (iommu_dummy(dev))
873 if (dev_is_pci(dev)) {
874 struct pci_dev *pf_pdev;
876 pdev = to_pci_dev(dev);
879 /* VMD child devices currently cannot be handled individually */
880 if (is_vmd(pdev->bus))
884 /* VFs aren't listed in scope tables; we need to look up
885 * the PF instead to find the IOMMU. */
886 pf_pdev = pci_physfn(pdev);
888 segment = pci_domain_nr(pdev->bus);
889 } else if (has_acpi_companion(dev))
890 dev = &ACPI_COMPANION(dev)->dev;
893 for_each_active_iommu(iommu, drhd) {
894 if (pdev && segment != drhd->segment)
897 for_each_active_dev_scope(drhd->devices,
898 drhd->devices_cnt, i, tmp) {
900 /* For a VF use its original BDF# not that of the PF
901 * which we used for the IOMMU lookup. Strictly speaking
902 * we could do this for all PCI devices; we only need to
903 * get the BDF# from the scope table for ACPI matches. */
904 if (pdev && pdev->is_virtfn)
907 *bus = drhd->devices[i].bus;
908 *devfn = drhd->devices[i].devfn;
912 if (!pdev || !dev_is_pci(tmp))
915 ptmp = to_pci_dev(tmp);
916 if (ptmp->subordinate &&
917 ptmp->subordinate->number <= pdev->bus->number &&
918 ptmp->subordinate->busn_res.end >= pdev->bus->number)
922 if (pdev && drhd->include_all) {
924 *bus = pdev->bus->number;
925 *devfn = pdev->devfn;
936 static void domain_flush_cache(struct dmar_domain *domain,
937 void *addr, int size)
939 if (!domain->iommu_coherency)
940 clflush_cache_range(addr, size);
943 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
945 struct context_entry *context;
949 spin_lock_irqsave(&iommu->lock, flags);
950 context = iommu_context_addr(iommu, bus, devfn, 0);
952 ret = context_present(context);
953 spin_unlock_irqrestore(&iommu->lock, flags);
957 static void free_context_table(struct intel_iommu *iommu)
961 struct context_entry *context;
963 spin_lock_irqsave(&iommu->lock, flags);
964 if (!iommu->root_entry) {
967 for (i = 0; i < ROOT_ENTRY_NR; i++) {
968 context = iommu_context_addr(iommu, i, 0, 0);
970 free_pgtable_page(context);
972 if (!ecs_enabled(iommu))
975 context = iommu_context_addr(iommu, i, 0x80, 0);
977 free_pgtable_page(context);
980 free_pgtable_page(iommu->root_entry);
981 iommu->root_entry = NULL;
983 spin_unlock_irqrestore(&iommu->lock, flags);
986 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
987 unsigned long pfn, int *target_level)
989 struct dma_pte *parent, *pte = NULL;
990 int level = agaw_to_level(domain->agaw);
993 BUG_ON(!domain->pgd);
995 if (!domain_pfn_supported(domain, pfn))
996 /* Address beyond IOMMU's addressing capabilities. */
999 parent = domain->pgd;
1004 offset = pfn_level_offset(pfn, level);
1005 pte = &parent[offset];
1006 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
1008 if (level == *target_level)
1011 if (!dma_pte_present(pte)) {
1014 tmp_page = alloc_pgtable_page(domain->nid);
1019 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1020 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1021 if (cmpxchg64(&pte->val, 0ULL, pteval))
1022 /* Someone else set it while we were thinking; use theirs. */
1023 free_pgtable_page(tmp_page);
1025 domain_flush_cache(domain, pte, sizeof(*pte));
1030 parent = phys_to_virt(dma_pte_addr(pte));
1035 *target_level = level;
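/*
 * Illustrative note (the example_* helper below is not part of the
 * original driver): the walk above consumes LEVEL_STRIDE (9) bits of the
 * DMA PFN per level, top level first.  A *target_level of 1 asks for an
 * ordinary 4KiB PTE, while 2 and 3 ask for a 2MiB or 1GiB superpage
 * slot; lvl_to_nr_pages() gives the number of 4KiB pages each of those
 * slots covers.
 */
static inline unsigned long example_pages_per_pte_level(int level)
{
        /* 1 -> 1 (4KiB), 2 -> 512 (2MiB), 3 -> 262144 (1GiB) */
        return lvl_to_nr_pages(level);
}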
1041 /* return address's pte at specific level */
1042 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1044 int level, int *large_page)
1046 struct dma_pte *parent, *pte = NULL;
1047 int total = agaw_to_level(domain->agaw);
1050 parent = domain->pgd;
1051 while (level <= total) {
1052 offset = pfn_level_offset(pfn, total);
1053 pte = &parent[offset];
1057 if (!dma_pte_present(pte)) {
1058 *large_page = total;
1062 if (dma_pte_superpage(pte)) {
1063 *large_page = total;
1067 parent = phys_to_virt(dma_pte_addr(pte));
1073 /* clear last level pte; a tlb flush should follow */
1074 static void dma_pte_clear_range(struct dmar_domain *domain,
1075 unsigned long start_pfn,
1076 unsigned long last_pfn)
1078 unsigned int large_page = 1;
1079 struct dma_pte *first_pte, *pte;
1081 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1082 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1083 BUG_ON(start_pfn > last_pfn);
1085 /* we don't need lock here; nobody else touches the iova range */
1088 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1090 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1095 start_pfn += lvl_to_nr_pages(large_page);
1097 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1099 domain_flush_cache(domain, first_pte,
1100 (void *)pte - (void *)first_pte);
1102 } while (start_pfn && start_pfn <= last_pfn);
1105 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1106 int retain_level, struct dma_pte *pte,
1107 unsigned long pfn, unsigned long start_pfn,
1108 unsigned long last_pfn)
1110 pfn = max(start_pfn, pfn);
1111 pte = &pte[pfn_level_offset(pfn, level)];
1114 unsigned long level_pfn;
1115 struct dma_pte *level_pte;
1117 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1120 level_pfn = pfn & level_mask(level);
1121 level_pte = phys_to_virt(dma_pte_addr(pte));
1124 dma_pte_free_level(domain, level - 1, retain_level,
1125 level_pte, level_pfn, start_pfn,
1130 * Free the page table if we're below the level we want to
1131 * retain and the range covers the entire table.
1133 if (level < retain_level && !(start_pfn > level_pfn ||
1134 last_pfn < level_pfn + level_size(level) - 1)) {
1136 domain_flush_cache(domain, pte, sizeof(*pte));
1137 free_pgtable_page(level_pte);
1140 pfn += level_size(level);
1141 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1145 * clear last level (leaf) ptes and free page table pages below the
1146 * level we wish to keep intact.
1148 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1149 unsigned long start_pfn,
1150 unsigned long last_pfn,
1153 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1154 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1155 BUG_ON(start_pfn > last_pfn);
1157 dma_pte_clear_range(domain, start_pfn, last_pfn);
1159 /* We don't need lock here; nobody else touches the iova range */
1160 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1161 domain->pgd, 0, start_pfn, last_pfn);
1164 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1165 free_pgtable_page(domain->pgd);
1170 /* When a page at a given level is being unlinked from its parent, we don't
1171 need to *modify* it at all. All we need to do is make a list of all the
1172 pages which can be freed just as soon as we've flushed the IOTLB and we
1173 know the hardware page-walk will no longer touch them.
1174 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1176 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1177 int level, struct dma_pte *pte,
1178 struct page *freelist)
1182 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1183 pg->freelist = freelist;
1189 pte = page_address(pg);
1191 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1192 freelist = dma_pte_list_pagetables(domain, level - 1,
1195 } while (!first_pte_in_page(pte));
1200 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1201 struct dma_pte *pte, unsigned long pfn,
1202 unsigned long start_pfn,
1203 unsigned long last_pfn,
1204 struct page *freelist)
1206 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1208 pfn = max(start_pfn, pfn);
1209 pte = &pte[pfn_level_offset(pfn, level)];
1212 unsigned long level_pfn;
1214 if (!dma_pte_present(pte))
1217 level_pfn = pfn & level_mask(level);
1219 /* If range covers entire pagetable, free it */
1220 if (start_pfn <= level_pfn &&
1221 last_pfn >= level_pfn + level_size(level) - 1) {
1222 /* These subordinate page tables are going away entirely. Don't
1223 bother to clear them; we're just going to *free* them. */
1224 if (level > 1 && !dma_pte_superpage(pte))
1225 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1231 } else if (level > 1) {
1232 /* Recurse down into a level that isn't *entirely* obsolete */
1233 freelist = dma_pte_clear_level(domain, level - 1,
1234 phys_to_virt(dma_pte_addr(pte)),
1235 level_pfn, start_pfn, last_pfn,
1239 pfn += level_size(level);
1240 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1243 domain_flush_cache(domain, first_pte,
1244 (void *)++last_pte - (void *)first_pte);
1249 /* We can't just free the pages because the IOMMU may still be walking
1250 the page tables, and may have cached the intermediate levels. The
1251 pages can only be freed after the IOTLB flush has been done. */
1252 static struct page *domain_unmap(struct dmar_domain *domain,
1253 unsigned long start_pfn,
1254 unsigned long last_pfn)
1256 struct page *freelist = NULL;
1258 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1259 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1260 BUG_ON(start_pfn > last_pfn);
1262 /* we don't need lock here; nobody else touches the iova range */
1263 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1264 domain->pgd, 0, start_pfn, last_pfn, NULL);
1267 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1268 struct page *pgd_page = virt_to_page(domain->pgd);
1269 pgd_page->freelist = freelist;
1270 freelist = pgd_page;
1278 static void dma_free_pagelist(struct page *freelist)
1282 while ((pg = freelist)) {
1283 freelist = pg->freelist;
1284 free_pgtable_page(page_address(pg));
1288 static void iova_entry_free(unsigned long data)
1290 struct page *freelist = (struct page *)data;
1292 dma_free_pagelist(freelist);
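/*
 * Illustrative usage sketch (not part of the original code): a caller
 * tearing down a mapping collects the no-longer-referenced page-table
 * pages first, flushes the IOTLB so the hardware can no longer walk
 * them, and only then hands the list back to the allocator, roughly:
 *
 *	struct page *freelist;
 *
 *	freelist = domain_unmap(domain, start_pfn, last_pfn);
 *	... flush the IOTLB for that range on every IOMMU in the domain ...
 *	dma_free_pagelist(freelist);
 */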
1295 /* iommu handling */
1296 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1298 struct root_entry *root;
1299 unsigned long flags;
1301 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1303 pr_err("Allocating root entry for %s failed\n",
1308 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1310 spin_lock_irqsave(&iommu->lock, flags);
1311 iommu->root_entry = root;
1312 spin_unlock_irqrestore(&iommu->lock, flags);
1317 static void iommu_set_root_entry(struct intel_iommu *iommu)
1323 addr = virt_to_phys(iommu->root_entry);
1324 if (ecs_enabled(iommu))
1325 addr |= DMA_RTADDR_RTT;
1327 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1328 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1330 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1332 /* Make sure hardware complete it */
1333 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1334 readl, (sts & DMA_GSTS_RTPS), sts);
1336 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1339 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1344 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1347 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1348 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1350 /* Make sure hardware complete it */
1351 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1352 readl, (!(val & DMA_GSTS_WBFS)), val);
1354 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1357 /* return value determines if we need a write buffer flush */
1358 static void __iommu_flush_context(struct intel_iommu *iommu,
1359 u16 did, u16 source_id, u8 function_mask,
1366 case DMA_CCMD_GLOBAL_INVL:
1367 val = DMA_CCMD_GLOBAL_INVL;
1369 case DMA_CCMD_DOMAIN_INVL:
1370 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1372 case DMA_CCMD_DEVICE_INVL:
1373 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1374 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1379 val |= DMA_CCMD_ICC;
1381 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1382 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1384 /* Make sure hardware complete it */
1385 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1386 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1388 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1391 /* return value determines if we need a write buffer flush */
1392 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1393 u64 addr, unsigned int size_order, u64 type)
1395 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1396 u64 val = 0, val_iva = 0;
1400 case DMA_TLB_GLOBAL_FLUSH:
1401 /* global flush doesn't need to set IVA_REG */
1402 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1404 case DMA_TLB_DSI_FLUSH:
1405 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1407 case DMA_TLB_PSI_FLUSH:
1408 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1409 /* IH bit is passed in as part of address */
1410 val_iva = size_order | addr;
1415 /* Note: set drain read/write */
1418 * This is probably just to be extra safe; it looks like we can
1419 * ignore it without any impact.
1421 if (cap_read_drain(iommu->cap))
1422 val |= DMA_TLB_READ_DRAIN;
1424 if (cap_write_drain(iommu->cap))
1425 val |= DMA_TLB_WRITE_DRAIN;
1427 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1428 /* Note: Only uses first TLB reg currently */
1430 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1431 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1433 /* Make sure hardware complete it */
1434 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1435 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1437 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1439 /* check IOTLB invalidation granularity */
1440 if (DMA_TLB_IAIG(val) == 0)
1441 pr_err("Flush IOTLB failed\n");
1442 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1443 pr_debug("TLB flush request %Lx, actual %Lx\n",
1444 (unsigned long long)DMA_TLB_IIRG(type),
1445 (unsigned long long)DMA_TLB_IAIG(val));
1448 static struct device_domain_info *
1449 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1452 struct device_domain_info *info;
1454 assert_spin_locked(&device_domain_lock);
1459 list_for_each_entry(info, &domain->devices, link)
1460 if (info->iommu == iommu && info->bus == bus &&
1461 info->devfn == devfn) {
1462 if (info->ats_supported && info->dev)
1470 static void domain_update_iotlb(struct dmar_domain *domain)
1472 struct device_domain_info *info;
1473 bool has_iotlb_device = false;
1475 assert_spin_locked(&device_domain_lock);
1477 list_for_each_entry(info, &domain->devices, link) {
1478 struct pci_dev *pdev;
1480 if (!info->dev || !dev_is_pci(info->dev))
1483 pdev = to_pci_dev(info->dev);
1484 if (pdev->ats_enabled) {
1485 has_iotlb_device = true;
1490 domain->has_iotlb_device = has_iotlb_device;
1493 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1495 struct pci_dev *pdev;
1497 assert_spin_locked(&device_domain_lock);
1499 if (!info || !dev_is_pci(info->dev))
1502 pdev = to_pci_dev(info->dev);
1504 #ifdef CONFIG_INTEL_IOMMU_SVM
1505 /* The PCIe spec, in its wisdom, declares that the behaviour of
1506 the device if you enable PASID support after ATS support is
1507 undefined. So always enable PASID support on devices which
1508 have it, even if we can't yet know if we're ever going to
1509 use it. */
1510 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1511 info->pasid_enabled = 1;
1513 if (info->pri_supported && !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1514 info->pri_enabled = 1;
1516 if (info->ats_supported && !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1517 info->ats_enabled = 1;
1518 domain_update_iotlb(info->domain);
1519 info->ats_qdep = pci_ats_queue_depth(pdev);
1523 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1525 struct pci_dev *pdev;
1527 assert_spin_locked(&device_domain_lock);
1529 if (!dev_is_pci(info->dev))
1532 pdev = to_pci_dev(info->dev);
1534 if (info->ats_enabled) {
1535 pci_disable_ats(pdev);
1536 info->ats_enabled = 0;
1537 domain_update_iotlb(info->domain);
1539 #ifdef CONFIG_INTEL_IOMMU_SVM
1540 if (info->pri_enabled) {
1541 pci_disable_pri(pdev);
1542 info->pri_enabled = 0;
1544 if (info->pasid_enabled) {
1545 pci_disable_pasid(pdev);
1546 info->pasid_enabled = 0;
1551 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1552 u64 addr, unsigned mask)
1555 unsigned long flags;
1556 struct device_domain_info *info;
1558 if (!domain->has_iotlb_device)
1561 spin_lock_irqsave(&device_domain_lock, flags);
1562 list_for_each_entry(info, &domain->devices, link) {
1563 if (!info->ats_enabled)
1566 sid = info->bus << 8 | info->devfn;
1567 qdep = info->ats_qdep;
1568 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1570 spin_unlock_irqrestore(&device_domain_lock, flags);
1573 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1574 struct dmar_domain *domain,
1575 unsigned long pfn, unsigned int pages,
1578 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1579 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1580 u16 did = domain->iommu_did[iommu->seq_id];
1587 * Fall back to domain selective flush if no PSI support or the size is
1588 * too big.
1589 * PSI requires page size to be 2 ^ x, and the base address is naturally
1590 * aligned to the size
1592 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1593 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1596 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1600 * In caching mode, changes of pages from non-present to present require
1601 * flush. However, device IOTLB doesn't need to be flushed in this case.
1603 if (!cap_caching_mode(iommu->cap) || !map)
1604 iommu_flush_dev_iotlb(domain, addr, mask);
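/*
 * Illustrative note (the example_* helper below is not part of the
 * original driver): the PSI mask is the order of the power-of-two
 * rounded page count, so a 5-page flush is widened to 8 pages and
 * issued with mask 3; the hardware then invalidates the naturally
 * aligned 8-page window containing addr.
 */
static inline unsigned int example_psi_mask(unsigned int pages)
{
        /* e.g. 1 -> 0, 2 -> 1, 5 -> 3, 512 -> 9 */
        return ilog2(__roundup_pow_of_two(pages));
}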
1607 static void iommu_flush_iova(struct iova_domain *iovad)
1609 struct dmar_domain *domain;
1612 domain = container_of(iovad, struct dmar_domain, iovad);
1614 for_each_domain_iommu(idx, domain) {
1615 struct intel_iommu *iommu = g_iommus[idx];
1616 u16 did = domain->iommu_did[iommu->seq_id];
1618 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1620 if (!cap_caching_mode(iommu->cap))
1621 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1622 0, MAX_AGAW_PFN_WIDTH);
1626 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1629 unsigned long flags;
1631 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1632 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1633 pmen &= ~DMA_PMEN_EPM;
1634 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1636 /* wait for the protected region status bit to clear */
1637 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1638 readl, !(pmen & DMA_PMEN_PRS), pmen);
1640 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1643 static void iommu_enable_translation(struct intel_iommu *iommu)
1646 unsigned long flags;
1648 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1649 iommu->gcmd |= DMA_GCMD_TE;
1650 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1652 /* Make sure hardware complete it */
1653 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1654 readl, (sts & DMA_GSTS_TES), sts);
1656 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1659 static void iommu_disable_translation(struct intel_iommu *iommu)
1664 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1665 iommu->gcmd &= ~DMA_GCMD_TE;
1666 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1668 /* Make sure hardware complete it */
1669 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1670 readl, (!(sts & DMA_GSTS_TES)), sts);
1672 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1676 static int iommu_init_domains(struct intel_iommu *iommu)
1678 u32 ndomains, nlongs;
1681 ndomains = cap_ndoms(iommu->cap);
1682 pr_debug("%s: Number of Domains supported <%d>\n",
1683 iommu->name, ndomains);
1684 nlongs = BITS_TO_LONGS(ndomains);
1686 spin_lock_init(&iommu->lock);
1688 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1689 if (!iommu->domain_ids) {
1690 pr_err("%s: Allocating domain id array failed\n",
1695 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1696 iommu->domains = kzalloc(size, GFP_KERNEL);
1698 if (iommu->domains) {
1699 size = 256 * sizeof(struct dmar_domain *);
1700 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1703 if (!iommu->domains || !iommu->domains[0]) {
1704 pr_err("%s: Allocating domain array failed\n",
1706 kfree(iommu->domain_ids);
1707 kfree(iommu->domains);
1708 iommu->domain_ids = NULL;
1709 iommu->domains = NULL;
1716 * If Caching mode is set, then invalid translations are tagged
1717 * with domain-id 0, hence we need to pre-allocate it. We also
1718 * use domain-id 0 as a marker for non-allocated domain-id, so
1719 * make sure it is not used for a real domain.
1721 set_bit(0, iommu->domain_ids);
1726 static void disable_dmar_iommu(struct intel_iommu *iommu)
1728 struct device_domain_info *info, *tmp;
1729 unsigned long flags;
1731 if (!iommu->domains || !iommu->domain_ids)
1735 spin_lock_irqsave(&device_domain_lock, flags);
1736 list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1737 struct dmar_domain *domain;
1739 if (info->iommu != iommu)
1742 if (!info->dev || !info->domain)
1745 domain = info->domain;
1747 __dmar_remove_one_dev_info(info);
1749 if (!domain_type_is_vm_or_si(domain)) {
1751 * The domain_exit() function can't be called under
1752 * device_domain_lock, as it takes this lock itself.
1753 * So release the lock here and re-run the loop
1756 spin_unlock_irqrestore(&device_domain_lock, flags);
1757 domain_exit(domain);
1761 spin_unlock_irqrestore(&device_domain_lock, flags);
1763 if (iommu->gcmd & DMA_GCMD_TE)
1764 iommu_disable_translation(iommu);
1767 static void free_dmar_iommu(struct intel_iommu *iommu)
1769 if ((iommu->domains) && (iommu->domain_ids)) {
1770 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1773 for (i = 0; i < elems; i++)
1774 kfree(iommu->domains[i]);
1775 kfree(iommu->domains);
1776 kfree(iommu->domain_ids);
1777 iommu->domains = NULL;
1778 iommu->domain_ids = NULL;
1781 g_iommus[iommu->seq_id] = NULL;
1783 /* free context mapping */
1784 free_context_table(iommu);
1786 #ifdef CONFIG_INTEL_IOMMU_SVM
1787 if (pasid_enabled(iommu)) {
1788 if (ecap_prs(iommu->ecap))
1789 intel_svm_finish_prq(iommu);
1790 intel_svm_free_pasid_tables(iommu);
1795 static struct dmar_domain *alloc_domain(int flags)
1797 struct dmar_domain *domain;
1799 domain = alloc_domain_mem();
1803 memset(domain, 0, sizeof(*domain));
1805 domain->flags = flags;
1806 domain->has_iotlb_device = false;
1807 INIT_LIST_HEAD(&domain->devices);
1812 /* Must be called with iommu->lock */
1813 static int domain_attach_iommu(struct dmar_domain *domain,
1814 struct intel_iommu *iommu)
1816 unsigned long ndomains;
1819 assert_spin_locked(&device_domain_lock);
1820 assert_spin_locked(&iommu->lock);
1822 domain->iommu_refcnt[iommu->seq_id] += 1;
1823 domain->iommu_count += 1;
1824 if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1825 ndomains = cap_ndoms(iommu->cap);
1826 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1828 if (num >= ndomains) {
1829 pr_err("%s: No free domain ids\n", iommu->name);
1830 domain->iommu_refcnt[iommu->seq_id] -= 1;
1831 domain->iommu_count -= 1;
1835 set_bit(num, iommu->domain_ids);
1836 set_iommu_domain(iommu, num, domain);
1838 domain->iommu_did[iommu->seq_id] = num;
1839 domain->nid = iommu->node;
1841 domain_update_iommu_cap(domain);
1847 static int domain_detach_iommu(struct dmar_domain *domain,
1848 struct intel_iommu *iommu)
1850 int num, count = INT_MAX;
1852 assert_spin_locked(&device_domain_lock);
1853 assert_spin_locked(&iommu->lock);
1855 domain->iommu_refcnt[iommu->seq_id] -= 1;
1856 count = --domain->iommu_count;
1857 if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1858 num = domain->iommu_did[iommu->seq_id];
1859 clear_bit(num, iommu->domain_ids);
1860 set_iommu_domain(iommu, num, NULL);
1862 domain_update_iommu_cap(domain);
1863 domain->iommu_did[iommu->seq_id] = 0;
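/*
 * Illustrative usage sketch (not part of the original code): both
 * helpers above assume the caller already holds device_domain_lock and
 * iommu->lock, which is how dmar_insert_one_dev_info() further down
 * calls them:
 *
 *	spin_lock_irqsave(&device_domain_lock, flags);
 *	...
 *	spin_lock(&iommu->lock);
 *	ret = domain_attach_iommu(domain, iommu);
 *	spin_unlock(&iommu->lock);
 *	...
 *	spin_unlock_irqrestore(&device_domain_lock, flags);
 */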
1869 static struct iova_domain reserved_iova_list;
1870 static struct lock_class_key reserved_rbtree_key;
1872 static int dmar_init_reserved_ranges(void)
1874 struct pci_dev *pdev = NULL;
1878 init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1880 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1881 &reserved_rbtree_key);
1883 /* IOAPIC ranges shouldn't be accessed by DMA */
1884 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1885 IOVA_PFN(IOAPIC_RANGE_END));
1887 pr_err("Reserve IOAPIC range failed\n");
1891 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1892 for_each_pci_dev(pdev) {
1895 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1896 r = &pdev->resource[i];
1897 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1899 iova = reserve_iova(&reserved_iova_list,
1903 pr_err("Reserve iova failed\n");
1911 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1913 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1916 static inline int guestwidth_to_adjustwidth(int gaw)
1919 int r = (gaw - 12) % 9;
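/*
 * Illustrative note (not part of the original code): the adjustment
 * rounds the guest width up to the next value of the form 12 + n * 9,
 * i.e. a whole number of page-table levels.  A 36-bit guest width
 * becomes 39 (3 levels), while 39, 48 and 57 are already exact and are
 * left unchanged.
 */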
1930 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1933 int adjust_width, agaw;
1934 unsigned long sagaw;
1937 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1939 err = init_iova_flush_queue(&domain->iovad,
1940 iommu_flush_iova, iova_entry_free);
1944 domain_reserve_special_ranges(domain);
1946 /* calculate AGAW */
1947 if (guest_width > cap_mgaw(iommu->cap))
1948 guest_width = cap_mgaw(iommu->cap);
1949 domain->gaw = guest_width;
1950 adjust_width = guestwidth_to_adjustwidth(guest_width);
1951 agaw = width_to_agaw(adjust_width);
1952 sagaw = cap_sagaw(iommu->cap);
1953 if (!test_bit(agaw, &sagaw)) {
1954 /* hardware doesn't support it, choose a bigger one */
1955 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1956 agaw = find_next_bit(&sagaw, 5, agaw);
1960 domain->agaw = agaw;
1962 if (ecap_coherent(iommu->ecap))
1963 domain->iommu_coherency = 1;
1965 domain->iommu_coherency = 0;
1967 if (ecap_sc_support(iommu->ecap))
1968 domain->iommu_snooping = 1;
1970 domain->iommu_snooping = 0;
1972 if (intel_iommu_superpage)
1973 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1975 domain->iommu_superpage = 0;
1977 domain->nid = iommu->node;
1979 /* always allocate the top pgd */
1980 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1983 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1987 static void domain_exit(struct dmar_domain *domain)
1989 struct page *freelist = NULL;
1991 /* Domain 0 is reserved, so don't process it */
1995 /* Remove associated devices and clear attached or cached domains */
1997 domain_remove_dev_info(domain);
2001 put_iova_domain(&domain->iovad);
2003 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
2005 dma_free_pagelist(freelist);
2007 free_domain_mem(domain);
2010 static int domain_context_mapping_one(struct dmar_domain *domain,
2011 struct intel_iommu *iommu,
2014 u16 did = domain->iommu_did[iommu->seq_id];
2015 int translation = CONTEXT_TT_MULTI_LEVEL;
2016 struct device_domain_info *info = NULL;
2017 struct context_entry *context;
2018 unsigned long flags;
2019 struct dma_pte *pgd;
2024 if (hw_pass_through && domain_type_is_si(domain))
2025 translation = CONTEXT_TT_PASS_THROUGH;
2027 pr_debug("Set context mapping for %02x:%02x.%d\n",
2028 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2030 BUG_ON(!domain->pgd);
2032 spin_lock_irqsave(&device_domain_lock, flags);
2033 spin_lock(&iommu->lock);
2036 context = iommu_context_addr(iommu, bus, devfn, 1);
2041 if (context_present(context))
2045 * For kdump cases, old valid entries may be cached due to the
2046 * in-flight DMA and copied pgtable, but there is no unmapping
2047 * behaviour for them, thus we need an explicit cache flush for
2048 * the newly-mapped device. For kdump, at this point, the device
2049 * is supposed to finish reset at its driver probe stage, so no
2050 * in-flight DMA will exist, and we don't need to worry anymore
2051 * hereafter.
2053 if (context_copied(context)) {
2054 u16 did_old = context_domain_id(context);
2056 if (did_old < cap_ndoms(iommu->cap)) {
2057 iommu->flush.flush_context(iommu, did_old,
2058 (((u16)bus) << 8) | devfn,
2059 DMA_CCMD_MASK_NOBIT,
2060 DMA_CCMD_DEVICE_INVL);
2061 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2068 context_clear_entry(context);
2069 context_set_domain_id(context, did);
2072 * Skip top levels of page tables for an iommu which has a smaller agaw
2073 * than the default. Unnecessary for PT mode.
2075 if (translation != CONTEXT_TT_PASS_THROUGH) {
2076 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
2078 pgd = phys_to_virt(dma_pte_addr(pgd));
2079 if (!dma_pte_present(pgd))
2083 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2084 if (info && info->ats_supported)
2085 translation = CONTEXT_TT_DEV_IOTLB;
2087 translation = CONTEXT_TT_MULTI_LEVEL;
2089 context_set_address_root(context, virt_to_phys(pgd));
2090 context_set_address_width(context, iommu->agaw);
2093 * In pass through mode, AW must be programmed to
2094 * indicate the largest AGAW value supported by
2095 * hardware. And ASR is ignored by hardware.
2097 context_set_address_width(context, iommu->msagaw);
2100 context_set_translation_type(context, translation);
2101 context_set_fault_enable(context);
2102 context_set_present(context);
2103 domain_flush_cache(domain, context, sizeof(*context));
2106 * It's a non-present to present mapping. If hardware doesn't cache
2107 * non-present entries we only need to flush the write-buffer. If the
2108 * hardware _does_ cache non-present entries, then it does so in the special
2109 * domain #0, which we have to flush:
2111 if (cap_caching_mode(iommu->cap)) {
2112 iommu->flush.flush_context(iommu, 0,
2113 (((u16)bus) << 8) | devfn,
2114 DMA_CCMD_MASK_NOBIT,
2115 DMA_CCMD_DEVICE_INVL);
2116 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2118 iommu_flush_write_buffer(iommu);
2120 iommu_enable_dev_iotlb(info);
2125 spin_unlock(&iommu->lock);
2126 spin_unlock_irqrestore(&device_domain_lock, flags);
2131 struct domain_context_mapping_data {
2132 struct dmar_domain *domain;
2133 struct intel_iommu *iommu;
2136 static int domain_context_mapping_cb(struct pci_dev *pdev,
2137 u16 alias, void *opaque)
2139 struct domain_context_mapping_data *data = opaque;
2141 return domain_context_mapping_one(data->domain, data->iommu,
2142 PCI_BUS_NUM(alias), alias & 0xff);
2146 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2148 struct intel_iommu *iommu;
2150 struct domain_context_mapping_data data;
2152 iommu = device_to_iommu(dev, &bus, &devfn);
2156 if (!dev_is_pci(dev))
2157 return domain_context_mapping_one(domain, iommu, bus, devfn);
2159 data.domain = domain;
2162 return pci_for_each_dma_alias(to_pci_dev(dev),
2163 &domain_context_mapping_cb, &data);
2166 static int domain_context_mapped_cb(struct pci_dev *pdev,
2167 u16 alias, void *opaque)
2169 struct intel_iommu *iommu = opaque;
2171 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2174 static int domain_context_mapped(struct device *dev)
2176 struct intel_iommu *iommu;
2179 iommu = device_to_iommu(dev, &bus, &devfn);
2183 if (!dev_is_pci(dev))
2184 return device_context_mapped(iommu, bus, devfn);
2186 return !pci_for_each_dma_alias(to_pci_dev(dev),
2187 domain_context_mapped_cb, iommu);
2190 /* Returns a number of VTD pages, but aligned to MM page size */
2191 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2194 host_addr &= ~PAGE_MASK;
2195 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
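/*
 * Illustrative note (the example_* helper below is not part of the
 * original driver): the page count depends on the offset into the first
 * page as well as on the size, so an 8KiB buffer that starts 0x234
 * bytes into a page straddles three 4KiB VT-d pages, while a
 * page-aligned 8KiB buffer needs only two.
 */
static inline unsigned long example_vtd_pages_for_8k(unsigned long host_addr)
{
        /* host_addr 0x1000 -> 2, host_addr 0x1234 -> 3 */
        return aligned_nrpages(host_addr, 0x2000);
}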
2198 /* Return largest possible superpage level for a given mapping */
2199 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2200 unsigned long iov_pfn,
2201 unsigned long phy_pfn,
2202 unsigned long pages)
2204 int support, level = 1;
2205 unsigned long pfnmerge;
2207 support = domain->iommu_superpage;
2209 /* To use a large page, the virtual *and* physical addresses
2210 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2211 of them will mean we have to use smaller pages. So just
2212 merge them and check both at once. */
2213 pfnmerge = iov_pfn | phy_pfn;
2215 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2216 pages >>= VTD_STRIDE_SHIFT;
2219 pfnmerge >>= VTD_STRIDE_SHIFT;
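/*
 * Illustrative note (not part of the original code): with
 * domain->iommu_superpage >= 1, an IOVA PFN and a physical PFN that are
 * both 512-page aligned together with a run of at least 512 pages yield
 * level 2, i.e. a 2MiB mapping; any misalignment in either PFN, or a
 * shorter run, keeps the mapping at ordinary 4KiB pages (level 1).
 */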
2226 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2227 struct scatterlist *sg, unsigned long phys_pfn,
2228 unsigned long nr_pages, int prot)
2230 struct dma_pte *first_pte = NULL, *pte = NULL;
2231 phys_addr_t uninitialized_var(pteval);
2232 unsigned long sg_res = 0;
2233 unsigned int largepage_lvl = 0;
2234 unsigned long lvl_pages = 0;
2236 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2238 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2241 prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2245 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2248 while (nr_pages > 0) {
2252 unsigned int pgoff = sg->offset & ~PAGE_MASK;
2254 sg_res = aligned_nrpages(sg->offset, sg->length);
2255 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2256 sg->dma_length = sg->length;
2257 pteval = (sg_phys(sg) - pgoff) | prot;
2258 phys_pfn = pteval >> VTD_PAGE_SHIFT;
2262 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2264 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2267 /* It is a large page */
2268 if (largepage_lvl > 1) {
2269 unsigned long nr_superpages, end_pfn;
2271 pteval |= DMA_PTE_LARGE_PAGE;
2272 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2274 nr_superpages = sg_res / lvl_pages;
2275 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2278 * Ensure that old small page tables are
2279 * removed to make room for superpage(s).
2280 * We're adding new large pages, so make sure
2281 * we don't remove their parent tables.
2283 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2286 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2290 /* We don't need lock here, nobody else
2291 * touches the iova range
2293 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2295 static int dumps = 5;
2296 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2297 iov_pfn, tmp, (unsigned long long)pteval);
2300 debug_dma_dump_mappings(NULL);
2305 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2307 BUG_ON(nr_pages < lvl_pages);
2308 BUG_ON(sg_res < lvl_pages);
2310 nr_pages -= lvl_pages;
2311 iov_pfn += lvl_pages;
2312 phys_pfn += lvl_pages;
2313 pteval += lvl_pages * VTD_PAGE_SIZE;
2314 sg_res -= lvl_pages;
2316 /* If the next PTE would be the first in a new page, then we
2317 need to flush the cache on the entries we've just written.
2318 And then we'll need to recalculate 'pte', so clear it and
2319 let it get set again in the if (!pte) block above.
2321 If we're done (!nr_pages) we need to flush the cache too.
2323 Also if we've been setting superpages, we may need to
2324 recalculate 'pte' and switch back to smaller pages for the
2325 end of the mapping, if the trailing size is not enough to
2326 use another superpage (i.e. sg_res < lvl_pages). */
2328 if (!nr_pages || first_pte_in_page(pte) ||
2329 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2330 domain_flush_cache(domain, first_pte,
2331 (void *)pte - (void *)first_pte);
2335 if (!sg_res && nr_pages)
2341 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2342 struct scatterlist *sg, unsigned long nr_pages,
2345 return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2348 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2349 unsigned long phys_pfn, unsigned long nr_pages,
2352 return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2355 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2357 unsigned long flags;
2358 struct context_entry *context;
2364 spin_lock_irqsave(&iommu->lock, flags);
2365 context = iommu_context_addr(iommu, bus, devfn, 0);
2367 spin_unlock_irqrestore(&iommu->lock, flags);
2370 did_old = context_domain_id(context);
2371 context_clear_entry(context);
2372 __iommu_flush_cache(iommu, context, sizeof(*context));
2373 spin_unlock_irqrestore(&iommu->lock, flags);
2374 iommu->flush.flush_context(iommu,
2376 (((u16)bus) << 8) | devfn,
2377 DMA_CCMD_MASK_NOBIT,
2378 DMA_CCMD_DEVICE_INVL);
2379 iommu->flush.flush_iotlb(iommu,
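/*
 * Editor's illustrative sketch (not part of the driver): the source-id passed
 * to the context-cache invalidation above is simply bus<<8 | devfn, so one
 * 16-bit value names a single bus:device.function (example values only).
 */
#if 0
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint8_t bus = 0x3a;
	uint8_t devfn = (0x02 << 3) | 0x1;		/* device 2, function 1 */
	uint16_t sid = ((uint16_t)bus << 8) | devfn;

	printf("source-id 0x%04x for %02x:%02x.%x\n",
	       (unsigned int)sid, bus, devfn >> 3, devfn & 7);
	return 0;
}
#endif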
2386 static inline void unlink_domain_info(struct device_domain_info *info)
2388 assert_spin_locked(&device_domain_lock);
2389 list_del(&info->link);
2390 list_del(&info->global);
2392 info->dev->archdata.iommu = NULL;
2395 static void domain_remove_dev_info(struct dmar_domain *domain)
2397 struct device_domain_info *info, *tmp;
2398 unsigned long flags;
2400 spin_lock_irqsave(&device_domain_lock, flags);
2401 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2402 __dmar_remove_one_dev_info(info);
2403 spin_unlock_irqrestore(&device_domain_lock, flags);
2408 * Note: we use struct device->archdata.iommu to store the info
2410 static struct dmar_domain *find_domain(struct device *dev)
2412 struct device_domain_info *info;
2414 /* No lock here, assumes no domain exit in normal case */
2415 info = dev->archdata.iommu;
2417 return info->domain;
2421 static inline struct device_domain_info *
2422 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2424 struct device_domain_info *info;
2426 list_for_each_entry(info, &device_domain_list, global)
2427 if (info->iommu->segment == segment && info->bus == bus &&
2428 info->devfn == devfn)
2434 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2437 struct dmar_domain *domain)
2439 struct dmar_domain *found = NULL;
2440 struct device_domain_info *info;
2441 unsigned long flags;
2444 info = alloc_devinfo_mem();
2449 info->devfn = devfn;
2450 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2451 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2454 info->domain = domain;
2455 info->iommu = iommu;
2457 if (dev && dev_is_pci(dev)) {
2458 struct pci_dev *pdev = to_pci_dev(info->dev);
2460 if (ecap_dev_iotlb_support(iommu->ecap) &&
2461 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2462 dmar_find_matched_atsr_unit(pdev))
2463 info->ats_supported = 1;
2465 if (ecs_enabled(iommu)) {
2466 if (pasid_enabled(iommu)) {
2467 int features = pci_pasid_features(pdev);
2469 info->pasid_supported = features | 1;
2472 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2473 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2474 info->pri_supported = 1;
2478 spin_lock_irqsave(&device_domain_lock, flags);
2480 found = find_domain(dev);
2483 struct device_domain_info *info2;
2484 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2486 found = info2->domain;
2492 spin_unlock_irqrestore(&device_domain_lock, flags);
2493 free_devinfo_mem(info);
2494 /* Caller must free the original domain */
2498 spin_lock(&iommu->lock);
2499 ret = domain_attach_iommu(domain, iommu);
2500 spin_unlock(&iommu->lock);
2503 spin_unlock_irqrestore(&device_domain_lock, flags);
2504 free_devinfo_mem(info);
2508 list_add(&info->link, &domain->devices);
2509 list_add(&info->global, &device_domain_list);
2511 dev->archdata.iommu = info;
2512 spin_unlock_irqrestore(&device_domain_lock, flags);
2514 if (dev && domain_context_mapping(domain, dev)) {
2515 pr_err("Domain context map for %s failed\n", dev_name(dev));
2516 dmar_remove_one_dev_info(domain, dev);
2523 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2525 *(u16 *)opaque = alias;
2529 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2531 struct device_domain_info *info = NULL;
2532 struct dmar_domain *domain = NULL;
2533 struct intel_iommu *iommu;
2534 u16 req_id, dma_alias;
2535 unsigned long flags;
2538 iommu = device_to_iommu(dev, &bus, &devfn);
2542 req_id = ((u16)bus << 8) | devfn;
2544 if (dev_is_pci(dev)) {
2545 struct pci_dev *pdev = to_pci_dev(dev);
2547 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2549 spin_lock_irqsave(&device_domain_lock, flags);
2550 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2551 PCI_BUS_NUM(dma_alias),
2554 iommu = info->iommu;
2555 domain = info->domain;
2557 spin_unlock_irqrestore(&device_domain_lock, flags);
2559 /* DMA alias already has a domain, use it */
2564 /* Allocate and initialize new domain for the device */
2565 domain = alloc_domain(0);
2568 if (domain_init(domain, iommu, gaw)) {
2569 domain_exit(domain);
2578 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2579 struct dmar_domain *domain)
2581 struct intel_iommu *iommu;
2582 struct dmar_domain *tmp;
2583 u16 req_id, dma_alias;
2586 iommu = device_to_iommu(dev, &bus, &devfn);
2590 req_id = ((u16)bus << 8) | devfn;
2592 if (dev_is_pci(dev)) {
2593 struct pci_dev *pdev = to_pci_dev(dev);
2595 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2597 /* register PCI DMA alias device */
2598 if (req_id != dma_alias) {
2599 tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2600 dma_alias & 0xff, NULL, domain);
2602 if (!tmp || tmp != domain)
2607 tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2608 if (!tmp || tmp != domain)
2614 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2616 struct dmar_domain *domain, *tmp;
2618 domain = find_domain(dev);
2622 domain = find_or_alloc_domain(dev, gaw);
2626 tmp = set_domain_for_dev(dev, domain);
2627 if (!tmp || domain != tmp) {
2628 domain_exit(domain);
2637 static int iommu_domain_identity_map(struct dmar_domain *domain,
2638 unsigned long long start,
2639 unsigned long long end)
2641 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2642 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2644 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2645 dma_to_mm_pfn(last_vpfn))) {
2646 pr_err("Reserving iova failed\n");
2650 pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2652 * The RMRR range might overlap with a physical memory range,
2655 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2657 return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2658 last_vpfn - first_vpfn + 1,
2659 DMA_PTE_READ|DMA_PTE_WRITE);
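/*
 * Editor's illustrative sketch (not part of the driver): the pfn arithmetic
 * used for the identity map above, assuming 4KiB pages (shift of 12).  The
 * RMRR base/end values are hypothetical.
 */
#if 0
#include <stdio.h>

int main(void)
{
	unsigned long long start = 0xbf800000ULL;	/* example RMRR base */
	unsigned long long end = 0xbfffffffULL;		/* inclusive end */
	unsigned long first_vpfn = start >> 12;
	unsigned long last_vpfn = end >> 12;

	/* 0xbf800..0xbffff: 2048 pages, an 8MiB 1:1 window */
	printf("%lu identity-mapped pages\n", last_vpfn - first_vpfn + 1);
	return 0;
}
#endif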
2662 static int domain_prepare_identity_map(struct device *dev,
2663 struct dmar_domain *domain,
2664 unsigned long long start,
2665 unsigned long long end)
2667 /* For _hardware_ passthrough, don't bother. But for software
2668 passthrough, we do it anyway -- it may indicate a memory
2669 range which is reserved in E820, and so didn't get set
2670 up to start with in si_domain */
2671 if (domain == si_domain && hw_pass_through) {
2672 pr_warn("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2673 dev_name(dev), start, end);
2677 pr_info("Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2678 dev_name(dev), start, end);
2681 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2682 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2683 dmi_get_system_info(DMI_BIOS_VENDOR),
2684 dmi_get_system_info(DMI_BIOS_VERSION),
2685 dmi_get_system_info(DMI_PRODUCT_VERSION));
2689 if (end >> agaw_to_width(domain->agaw)) {
2690 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2691 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2692 agaw_to_width(domain->agaw),
2693 dmi_get_system_info(DMI_BIOS_VENDOR),
2694 dmi_get_system_info(DMI_BIOS_VERSION),
2695 dmi_get_system_info(DMI_PRODUCT_VERSION));
2699 return iommu_domain_identity_map(domain, start, end);
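/*
 * Editor's illustrative sketch (not part of the driver): the address-width
 * sanity check above.  With a hypothetical 39-bit (3-level) domain, any RMRR
 * end address with bits set at or above bit 39 trips the warning.
 */
#if 0
#include <stdio.h>

int main(void)
{
	int addr_width = 39;				/* example AGAW width */
	unsigned long long end = 0x8080000000ULL;	/* bit 39 set */

	if (end >> addr_width)
		printf("RMRR end exceeds the %d-bit address width\n", addr_width);
	return 0;
}
#endif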
2702 static int iommu_prepare_identity_map(struct device *dev,
2703 unsigned long long start,
2704 unsigned long long end)
2706 struct dmar_domain *domain;
2709 domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2713 ret = domain_prepare_identity_map(dev, domain, start, end);
2715 domain_exit(domain);
2720 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2723 if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2725 return iommu_prepare_identity_map(dev, rmrr->base_address,
2729 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2730 static inline void iommu_prepare_isa(void)
2732 struct pci_dev *pdev;
2735 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2739 pr_info("Prepare 0-16MiB unity mapping for LPC\n");
2740 ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
2743 pr_err("Failed to create 0-16MiB identity map - floppy might not work\n");
2748 static inline void iommu_prepare_isa(void)
2752 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2754 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2756 static int __init si_domain_init(int hw)
2760 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2764 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2765 domain_exit(si_domain);
2769 pr_debug("Identity mapping domain allocated\n");
2774 for_each_online_node(nid) {
2775 unsigned long start_pfn, end_pfn;
2778 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2779 ret = iommu_domain_identity_map(si_domain,
2780 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2789 static int identity_mapping(struct device *dev)
2791 struct device_domain_info *info;
2793 if (likely(!iommu_identity_mapping))
2796 info = dev->archdata.iommu;
2797 if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2798 return (info->domain == si_domain);
2803 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2805 struct dmar_domain *ndomain;
2806 struct intel_iommu *iommu;
2809 iommu = device_to_iommu(dev, &bus, &devfn);
2813 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2814 if (ndomain != domain)
2820 static bool device_has_rmrr(struct device *dev)
2822 struct dmar_rmrr_unit *rmrr;
2827 for_each_rmrr_units(rmrr) {
2829 * Return TRUE if this RMRR contains the device that
2832 for_each_active_dev_scope(rmrr->devices,
2833 rmrr->devices_cnt, i, tmp)
2844 * There are a couple cases where we need to restrict the functionality of
2845 * devices associated with RMRRs. The first is when evaluating a device for
2846 * identity mapping because problems exist when devices are moved in and out
2847 * of domains and their respective RMRR information is lost. This means that
2848 * a device with associated RMRRs will never be in a "passthrough" domain.
2849 * The second is use of the device through the IOMMU API. This interface
2850 * expects to have full control of the IOVA space for the device. We cannot
2851 * satisfy both the requirement that RMRR access is maintained and have an
2852 * unencumbered IOVA space. We also have no ability to quiesce the device's
2853 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2854 * We therefore prevent devices associated with an RMRR from participating in
2855 * the IOMMU API, which eliminates them from device assignment.
2857 * In both cases we assume that PCI USB devices with RMRRs have them largely
2858 * for historical reasons and that the RMRR space is not actively used post
2859 * boot. This exclusion may change if vendors begin to abuse it.
2861 * The same exception is made for graphics devices, with the requirement that
2862 * any use of the RMRR regions will be torn down before assigning the device
2865 static bool device_is_rmrr_locked(struct device *dev)
2867 if (!device_has_rmrr(dev))
2870 if (dev_is_pci(dev)) {
2871 struct pci_dev *pdev = to_pci_dev(dev);
2873 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
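/*
 * Editor's illustrative sketch (not part of the driver): the RMRR-lock policy
 * described in the comment above, reduced to a plain predicate.  The sk_*
 * structure and names are hypothetical stand-ins for the PCI checks above.
 */
#if 0
#include <stdbool.h>

struct sk_dev {
	bool has_rmrr;
	bool is_usb;
	bool is_gfx;
};

static bool sk_rmrr_locked(const struct sk_dev *d)
{
	if (!d->has_rmrr)
		return false;
	/* USB and graphics devices are assumed to need their RMRR only for
	 * legacy/boot use, so they are not locked out */
	return !(d->is_usb || d->is_gfx);
}
#endif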
2880 static int iommu_should_identity_map(struct device *dev, int startup)
2883 if (dev_is_pci(dev)) {
2884 struct pci_dev *pdev = to_pci_dev(dev);
2886 if (device_is_rmrr_locked(dev))
2889 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2892 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2895 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2899 * We want to start off with all devices in the 1:1 domain, and
2900 * take them out later if we find they can't access all of memory.
2902 * However, we can't do this for PCI devices behind bridges,
2903 * because all PCI devices behind the same bridge will end up
2904 * with the same source-id on their transactions.
2906 * Practically speaking, we can't change things around for these
2907 * devices at run-time, because we can't be sure there'll be no
2908 * DMA transactions in flight for any of their siblings.
2910 * So PCI devices (unless they're on the root bus) as well as
2911 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2912 * the 1:1 domain, just in _case_ one of their siblings turns out
2913 * not to be able to map all of memory.
2915 if (!pci_is_pcie(pdev)) {
2916 if (!pci_is_root_bus(pdev->bus))
2918 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2920 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
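/*
 * Editor's illustrative sketch (not part of the driver): the bridge exclusion
 * described above as a simplified predicate.  sk_* names are hypothetical and
 * "is_pci_bridge" lumps together the conventional-PCI and PCIe-to-PCI bridge
 * cases that are checked separately above.
 */
#if 0
#include <stdbool.h>

struct sk_pci {
	bool is_pcie;
	bool on_root_bus;
	bool is_pci_bridge;
};

static bool sk_identity_candidate(const struct sk_pci *p)
{
	if (!p->is_pcie)
		/* conventional PCI devices share a source-id with their
		 * siblings unless they sit directly on the root bus */
		return p->on_root_bus && !p->is_pci_bridge;
	return !p->is_pci_bridge;
}
#endif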
2923 if (device_has_rmrr(dev))
2928 * At boot time, we don't yet know if devices will be 64-bit capable.
2929 * Assume that they will — if they turn out not to be, then we can
2930 * take them out of the 1:1 domain later.
2934 * If the device's dma_mask is less than the system's memory
2935 * size then this is not a candidate for identity mapping.
2937 u64 dma_mask = *dev->dma_mask;
2939 if (dev->coherent_dma_mask &&
2940 dev->coherent_dma_mask < dma_mask)
2941 dma_mask = dev->coherent_dma_mask;
2943 return dma_mask >= dma_get_required_mask(dev);
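/*
 * Editor's illustrative sketch (not part of the driver): the DMA-mask test
 * above in isolation.  A device whose effective mask cannot cover the mask
 * required for all of system memory is not an identity-map candidate; the
 * masks below are example values.
 */
#if 0
#include <stdint.h>
#include <stdbool.h>

static bool sk_can_identity_map(uint64_t dma_mask, uint64_t coherent_mask,
				uint64_t required_mask)
{
	if (coherent_mask && coherent_mask < dma_mask)
		dma_mask = coherent_mask;
	return dma_mask >= required_mask;
}

/* e.g. a 32-bit-only device on a machine that needs a 36-bit mask:
 * sk_can_identity_map(0xffffffffULL, 0xffffffffULL, 0xfffffffffULL) == false */
#endif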
2949 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
2953 if (!iommu_should_identity_map(dev, 1))
2956 ret = domain_add_dev_info(si_domain, dev);
2958 pr_info("%s identity mapping for device %s\n",
2959 hw ? "Hardware" : "Software", dev_name(dev));
2960 else if (ret == -ENODEV)
2961 /* device not associated with an iommu */
2968 static int __init iommu_prepare_static_identity_mapping(int hw)
2970 struct pci_dev *pdev = NULL;
2971 struct dmar_drhd_unit *drhd;
2972 struct intel_iommu *iommu;
2977 for_each_pci_dev(pdev) {
2978 ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
2983 for_each_active_iommu(iommu, drhd)
2984 for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
2985 struct acpi_device_physical_node *pn;
2986 struct acpi_device *adev;
2988 if (dev->bus != &acpi_bus_type)
2991 adev = to_acpi_device(dev);
2992 mutex_lock(&adev->physical_node_lock);
2993 list_for_each_entry(pn, &adev->physical_node_list, node) {
2994 ret = dev_prepare_static_identity_mapping(pn->dev, hw);
2998 mutex_unlock(&adev->physical_node_lock);
3006 static void intel_iommu_init_qi(struct intel_iommu *iommu)
3009 * Start from a sane iommu hardware state.
3010 * If queued invalidation has already been initialized by us
3011 * (for example, while enabling interrupt remapping), then
3012 * things are already rolling from a sane state.
3016 * Clear any previous faults.
3018 dmar_fault(-1, iommu);
3020 * Disable queued invalidation if supported and already enabled
3021 * before OS handover.
3023 dmar_disable_qi(iommu);
3026 if (dmar_enable_qi(iommu)) {
3028 * Queued Invalidate not enabled, use Register Based Invalidate
3030 iommu->flush.flush_context = __iommu_flush_context;
3031 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3032 pr_info("%s: Using Register based invalidation\n",
3035 iommu->flush.flush_context = qi_flush_context;
3036 iommu->flush.flush_iotlb = qi_flush_iotlb;
3037 pr_info("%s: Using Queued invalidation\n", iommu->name);
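/*
 * Editor's illustrative sketch (not part of the driver): the dispatch pattern
 * set up above.  Both invalidation paths sit behind one pair of function
 * pointers, so callers never check which mechanism is in use (sk_* names are
 * hypothetical).
 */
#if 0
#include <stdio.h>

struct sk_flush_ops {
	void (*flush_context)(void);
	void (*flush_iotlb)(void);
};

static void sk_reg_context(void) { puts("register-based context flush"); }
static void sk_reg_iotlb(void)   { puts("register-based iotlb flush"); }
static void sk_qi_context(void)  { puts("queued-invalidation context flush"); }
static void sk_qi_iotlb(void)    { puts("queued-invalidation iotlb flush"); }

static void sk_init_qi(struct sk_flush_ops *ops, int qi_enabled)
{
	if (!qi_enabled) {
		ops->flush_context = sk_reg_context;
		ops->flush_iotlb = sk_reg_iotlb;
	} else {
		ops->flush_context = sk_qi_context;
		ops->flush_iotlb = sk_qi_iotlb;
	}
}
#endif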
3041 static int copy_context_table(struct intel_iommu *iommu,
3042 struct root_entry *old_re,
3043 struct context_entry **tbl,
3046 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3047 struct context_entry *new_ce = NULL, ce;
3048 struct context_entry *old_ce = NULL;
3049 struct root_entry re;
3050 phys_addr_t old_ce_phys;
3052 tbl_idx = ext ? bus * 2 : bus;
3053 memcpy(&re, old_re, sizeof(re));
3055 for (devfn = 0; devfn < 256; devfn++) {
3056 /* First calculate the correct index */
3057 idx = (ext ? devfn * 2 : devfn) % 256;
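/*
 * Editor's illustrative sketch (not part of the driver): the extended-context
 * index math above.  With ECS each context entry is twice as large, so one
 * bus needs two 4KiB tables: devfns 0..127 land in the first, 128..255 in the
 * second (standalone example loop).
 */
#if 0
#include <stdio.h>

int main(void)
{
	int ext = 1;
	int devfn;

	for (devfn = 0; devfn < 256; devfn += 128) {
		int idx = (ext ? devfn * 2 : devfn) % 256;
		int pos = (devfn * 2) / 256;	/* which of the two tables */

		printf("devfn %3d -> table %d, index %d\n", devfn, pos, idx);
	}
	return 0;
}
#endif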
3060 /* First save what we may have and clean up */
3062 tbl[tbl_idx] = new_ce;
3063 __iommu_flush_cache(iommu, new_ce,
3073 old_ce_phys = root_entry_lctp(&re);
3075 old_ce_phys = root_entry_uctp(&re);
3078 if (ext && devfn == 0) {
3079 /* No LCTP, try UCTP */
3088 old_ce = memremap(old_ce_phys, PAGE_SIZE,
3093 new_ce = alloc_pgtable_page(iommu->node);
3100 /* Now copy the context entry */
3101 memcpy(&ce, old_ce + idx, sizeof(ce));
3103 if (!__context_present(&ce))
3106 did = context_domain_id(&ce);
3107 if (did >= 0 && did < cap_ndoms(iommu->cap))
3108 set_bit(did, iommu->domain_ids);
3111 * We need a marker for copied context entries. This
3112 * marker needs to work for the old format as well as
3113 * for extended context entries.
3115 * Bit 67 of the context entry is used. In the old
3116 * format this bit is available to software, in the
3117 * extended format it is the PGE bit, but PGE is ignored
3118 * by HW if PASIDs are disabled (and thus still
3121 * So disable PASIDs first and then mark the entry
3122 * copied. This means that we don't copy PASID
3123 * translations from the old kernel, but this is fine as
3124 * faults there are not fatal.
3126 context_clear_pasid_enable(&ce);
3127 context_set_copied(&ce);
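/*
 * Editor's illustrative sketch (not part of the driver): the "copied" marker
 * described above, assuming the 128-bit context entry is held as two 64-bit
 * words, so overall bit 67 is bit 3 of the high word (sk_* names hypothetical).
 */
#if 0
#include <stdint.h>
#include <stdbool.h>

struct sk_context_entry {
	uint64_t lo;
	uint64_t hi;
};

static void sk_set_copied(struct sk_context_entry *ce)
{
	ce->hi |= 1ULL << 3;		/* bit 67 of the whole entry */
}

static bool sk_is_copied(const struct sk_context_entry *ce)
{
	return ce->hi & (1ULL << 3);
}
#endif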
3132 tbl[tbl_idx + pos] = new_ce;
3134 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3143 static int copy_translation_tables(struct intel_iommu *iommu)
3145 struct context_entry **ctxt_tbls;
3146 struct root_entry *old_rt;
3147 phys_addr_t old_rt_phys;
3148 int ctxt_table_entries;
3149 unsigned long flags;
3154 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3155 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
3156 new_ext = !!ecap_ecs(iommu->ecap);
3159 * The RTT bit can only be changed when translation is disabled,
3160 * but disabling translation means to open a window for data
3161 * corruption. So bail out and don't copy anything if we would
3162 * have to change the bit.
3167 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3171 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3175 /* This is too big for the stack - allocate it from slab */
3176 ctxt_table_entries = ext ? 512 : 256;
3178 ctxt_tbls = kzalloc(ctxt_table_entries * sizeof(void *), GFP_KERNEL);
3182 for (bus = 0; bus < 256; bus++) {
3183 ret = copy_context_table(iommu, &old_rt[bus],
3184 ctxt_tbls, bus, ext);
3186 pr_err("%s: Failed to copy context table for bus %d\n",
3192 spin_lock_irqsave(&iommu->lock, flags);
3194 /* Context tables are copied, now write them to the root_entry table */
3195 for (bus = 0; bus < 256; bus++) {
3196 int idx = ext ? bus * 2 : bus;
3199 if (ctxt_tbls[idx]) {
3200 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3201 iommu->root_entry[bus].lo = val;
3204 if (!ext || !ctxt_tbls[idx + 1])
3207 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3208 iommu->root_entry[bus].hi = val;
3211 spin_unlock_irqrestore(&iommu->lock, flags);
3215 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
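/*
 * Editor's illustrative sketch (not part of the driver): the root-entry value
 * written above is the 4KiB-aligned physical address of a context table with
 * bit 0 used as the present bit (the address below is hypothetical).
 */
#if 0
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t ctx_table_phys = 0x12345000ULL;	/* 4KiB aligned */
	uint64_t root_lo = ctx_table_phys | 1;		/* set present bit */

	printf("root.lo = 0x%llx\n", (unsigned long long)root_lo);
	return 0;
}
#endif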
3225 static int __init init_dmars(void)
3227 struct dmar_drhd_unit *drhd;
3228 struct dmar_rmrr_unit *rmrr;
3229 bool copied_tables = false;
3231 struct intel_iommu *iommu;
3237 * initialize and program root entry to not present
3240 for_each_drhd_unit(drhd) {
3242 * lock not needed as this is only incremented in the single-
3243 * threaded kernel __init code path; all other accesses are reads
3246 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3250 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3253 /* Preallocate enough resources for IOMMU hot-addition */
3254 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3255 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3257 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3260 pr_err("Allocating global iommu array failed\n");
3265 for_each_active_iommu(iommu, drhd) {
3266 g_iommus[iommu->seq_id] = iommu;
3268 intel_iommu_init_qi(iommu);
3270 ret = iommu_init_domains(iommu);
3274 init_translation_status(iommu);
3276 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3277 iommu_disable_translation(iommu);
3278 clear_translation_pre_enabled(iommu);
3279 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3285 * we could share the same root & context tables
3286 * among all IOMMUs. This needs to be split out later.
3288 ret = iommu_alloc_root_entry(iommu);
3292 if (translation_pre_enabled(iommu)) {
3293 pr_info("Translation already enabled - trying to copy translation structures\n");
3295 ret = copy_translation_tables(iommu);
3298 * We found the IOMMU with translation
3299 * enabled - but failed to copy over the
3300 * old root-entry table. Try to proceed
3301 * by disabling translation now and
3302 * allocating a clean root-entry table.
3303 * This might cause DMAR faults, but
3304 * probably the dump will still succeed.
3306 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3308 iommu_disable_translation(iommu);
3309 clear_translation_pre_enabled(iommu);
3311 pr_info("Copied translation tables from previous kernel for %s\n",
3313 copied_tables = true;
3317 if (!ecap_pass_through(iommu->ecap))
3318 hw_pass_through = 0;
3319 #ifdef CONFIG_INTEL_IOMMU_SVM
3320 if (pasid_enabled(iommu))
3321 intel_svm_alloc_pasid_tables(iommu);
3326 * Now that qi is enabled on all iommus, set the root entry and flush
3327 * caches. This is required on some Intel X58 chipsets, otherwise the
3328 * flush_context function will loop forever and the boot hangs.
3330 for_each_active_iommu(iommu, drhd) {
3331 iommu_flush_write_buffer(iommu);
3332 iommu_set_root_entry(iommu);
3333 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3334 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3337 if (iommu_pass_through)
3338 iommu_identity_mapping |= IDENTMAP_ALL;
3340 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3341 iommu_identity_mapping |= IDENTMAP_GFX;
3344 check_tylersburg_isoch();
3346 if (iommu_identity_mapping) {
3347 ret = si_domain_init(hw_pass_through);
3354 * If we copied translations from a previous kernel in the kdump
3355 * case, we cannot assign the devices to domains now, as that
3356 * would eliminate the old mappings. So skip this part and defer
3357 * the assignment to device driver initialization time.
3363 * If pass-through is not set or not enabled, set up context entries for
3364 * identity mappings for RMRR, GFX and ISA, and possibly fall back to
3365 * static identity mapping if iommu_identity_mapping is set.
3367 if (iommu_identity_mapping) {
3368 ret = iommu_prepare_static_identity_mapping(hw_pass_through);