f002d47d2f27a95438def0aa4d2c471e9ced9e4b
[muen/linux.git] / drivers / iommu / intel-iommu.c
1 /*
2  * Copyright © 2006-2014 Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * Authors: David Woodhouse <dwmw2@infradead.org>,
14  *          Ashok Raj <ashok.raj@intel.com>,
15  *          Shaohua Li <shaohua.li@intel.com>,
16  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
17  *          Fenghua Yu <fenghua.yu@intel.com>
18  *          Joerg Roedel <jroedel@suse.de>
19  */
20
21 #define pr_fmt(fmt)     "DMAR: " fmt
22 #define dev_fmt(fmt)    pr_fmt(fmt)
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/export.h>
28 #include <linux/slab.h>
29 #include <linux/irq.h>
30 #include <linux/interrupt.h>
31 #include <linux/spinlock.h>
32 #include <linux/pci.h>
33 #include <linux/dmar.h>
34 #include <linux/dma-mapping.h>
35 #include <linux/mempool.h>
36 #include <linux/memory.h>
37 #include <linux/cpu.h>
38 #include <linux/timer.h>
39 #include <linux/io.h>
40 #include <linux/iova.h>
41 #include <linux/iommu.h>
42 #include <linux/intel-iommu.h>
43 #include <linux/syscore_ops.h>
44 #include <linux/tboot.h>
45 #include <linux/dmi.h>
46 #include <linux/pci-ats.h>
47 #include <linux/memblock.h>
48 #include <linux/dma-contiguous.h>
49 #include <linux/dma-direct.h>
50 #include <linux/crash_dump.h>
51 #include <linux/numa.h>
52 #include <asm/irq_remapping.h>
53 #include <asm/cacheflush.h>
54 #include <asm/iommu.h>
55
56 #include "irq_remapping.h"
57 #include "intel-pasid.h"
58
59 #define ROOT_SIZE               VTD_PAGE_SIZE
60 #define CONTEXT_SIZE            VTD_PAGE_SIZE
61
62 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
63 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
64 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
65 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
66
67 #define IOAPIC_RANGE_START      (0xfee00000)
68 #define IOAPIC_RANGE_END        (0xfeefffff)
69 #define IOVA_START_ADDR         (0x1000)
70
71 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
72
73 #define MAX_AGAW_WIDTH 64
74 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
75
76 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
77 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
78
79 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
80    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
81 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
82                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
83 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
84
85 /* IO virtual address start page frame number */
86 #define IOVA_START_PFN          (1)
87
88 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
89
90 /* page table handling */
91 #define LEVEL_STRIDE            (9)
92 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
93
94 /*
95  * This bitmap is used to advertise the page sizes our hardware support
96  * to the IOMMU core, which will then use this information to split
97  * physically contiguous memory regions it is mapping into page sizes
98  * that we support.
99  *
100  * Traditionally the IOMMU core just handed us the mappings directly,
101  * after making sure the size is an order of a 4KiB page and that the
102  * mapping has natural alignment.
103  *
104  * To retain this behavior, we currently advertise that we support
105  * all page sizes that are an order of 4KiB.
106  *
107  * If at some point we'd like to utilize the IOMMU core's new behavior,
108  * we could change this to advertise the real page sizes we support.
109  */
110 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
111
/* Number of page-table levels used for a given AGAW value. */
static inline int agaw_to_level(int agaw)
{
        int nr_levels = agaw + 2;

        return nr_levels;
}
116
/* Address width (in bits) covered by a given AGAW, capped at 64. */
static inline int agaw_to_width(int agaw)
{
        return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
}

/* Smallest AGAW whose address width covers @width bits. */
static inline int width_to_agaw(int width)
{
        return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
}

/* Bit position (within a DMA pfn) where @level's index field starts. */
static inline unsigned int level_to_offset_bits(int level)
{
        return (level - 1) * LEVEL_STRIDE;
}

/* Index into the @level page table for DMA page frame @pfn. */
static inline int pfn_level_offset(unsigned long pfn, int level)
{
        return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
}

/* Mask clearing all pfn bits below @level's index field. */
static inline unsigned long level_mask(int level)
{
        return -1UL << level_to_offset_bits(level);
}

/* Number of base pages mapped by one entry at @level. */
static inline unsigned long level_size(int level)
{
        return 1UL << level_to_offset_bits(level);
}

/* Round @pfn up to the next @level-sized boundary. */
static inline unsigned long align_to_level(unsigned long pfn, int level)
{
        return (pfn + level_size(level) - 1) & level_mask(level);
}
151
152 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
153 {
154         return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
155 }
156
/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
   are never going to work. */
/* Convert a VT-d page frame number to an MM page frame number. */
static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
{
        return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
}

/* Convert an MM page frame number to a VT-d page frame number. */
static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
{
        return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
}
/* First VT-d page frame number of struct page @pg. */
static inline unsigned long page_to_dma_pfn(struct page *pg)
{
        return mm_to_dma_pfn(page_to_pfn(pg));
}
/* VT-d page frame number backing kernel virtual address @p. */
static inline unsigned long virt_to_dma_pfn(void *p)
{
        return page_to_dma_pfn(virt_to_page(p));
}
176
177 /* global iommu list, set NULL for ignored DMAR units */
178 static struct intel_iommu **g_iommus;
179
180 static void __init check_tylersburg_isoch(void);
181 static int rwbf_quirk;
182
183 /*
184  * set to 1 to panic kernel if can't successfully enable VT-d
185  * (used when kernel is launched w/ TXT)
186  */
187 static int force_on = 0;
188 int intel_iommu_tboot_noforce;
189 static int no_platform_optin;
190
191 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
192
193 /*
194  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
195  * if marked present.
196  */
197 static phys_addr_t root_entry_lctp(struct root_entry *re)
198 {
199         if (!(re->lo & 1))
200                 return 0;
201
202         return re->lo & VTD_PAGE_MASK;
203 }
204
205 /*
206  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
207  * if marked present.
208  */
209 static phys_addr_t root_entry_uctp(struct root_entry *re)
210 {
211         if (!(re->hi & 1))
212                 return 0;
213
214         return re->hi & VTD_PAGE_MASK;
215 }
216
/* Clear the PASID-enable bit (bit 11 of the low qword). */
static inline void context_clear_pasid_enable(struct context_entry *context)
{
        context->lo &= ~(1ULL << 11);
}

/* True if the PASID-enable bit (bit 11 of the low qword) is set. */
static inline bool context_pasid_enabled(struct context_entry *context)
{
        return !!(context->lo & (1ULL << 11));
}

/* Mark this entry as copied over from a previous kernel's tables
 * (software-defined bit 3 of the high qword). */
static inline void context_set_copied(struct context_entry *context)
{
        context->hi |= (1ull << 3);
}

/* True if this entry carries the "copied" flag. */
static inline bool context_copied(struct context_entry *context)
{
        return !!(context->hi & (1ULL << 3));
}

/* Raw present bit (bit 0 of the low qword), ignoring the copied flag. */
static inline bool __context_present(struct context_entry *context)
{
        return (context->lo & 1);
}

/*
 * An entry set up by an earlier kernel (copied flag set) does not count
 * as present.  NOTE(review): when PASID is enabled the copied flag is
 * not consulted at all — presumably bit 3 has architectural meaning in
 * that mode; confirm against the VT-d specification.
 */
bool context_present(struct context_entry *context)
{
        return context_pasid_enabled(context) ?
             __context_present(context) :
             __context_present(context) && !context_copied(context);
}

/* Set the present bit (bit 0 of the low qword). */
static inline void context_set_present(struct context_entry *context)
{
        context->lo |= 1;
}

/* Clear bit 1 of the low qword (fault-processing disable, per the
 * function name), leaving bit 0 and bits 63:2 untouched. */
static inline void context_set_fault_enable(struct context_entry *context)
{
        context->lo &= (((u64)-1) << 2) | 1;
}

/* Program the 2-bit translation type into bits 3:2 of the low qword. */
static inline void context_set_translation_type(struct context_entry *context,
                                                unsigned long value)
{
        context->lo &= (((u64)-1) << 4) | 3;
        context->lo |= (value & 3) << 2;
}

/* Install the page-aligned page-table root address in the low qword. */
static inline void context_set_address_root(struct context_entry *context,
                                            unsigned long value)
{
        context->lo &= ~VTD_PAGE_MASK;
        context->lo |= value & VTD_PAGE_MASK;
}

/* OR the 3-bit address width into bits 2:0 of the high qword.
 * Note: does not clear previously-set bits first. */
static inline void context_set_address_width(struct context_entry *context,
                                             unsigned long value)
{
        context->hi |= value & 7;
}

/* OR the 16-bit domain id into bits 23:8 of the high qword.
 * Note: does not clear previously-set bits first. */
static inline void context_set_domain_id(struct context_entry *context,
                                         unsigned long value)
{
        context->hi |= (value & ((1 << 16) - 1)) << 8;
}

/* Extract the 16-bit domain id from bits 23:8 of the high qword. */
static inline int context_domain_id(struct context_entry *c)
{
        return((c->hi >> 8) & 0xffff);
}

/* Zero both qwords, marking the entry not-present. */
static inline void context_clear_entry(struct context_entry *context)
{
        context->lo = 0;
        context->hi = 0;
}
295
/*
 * This domain is a static identity-mapping domain.
 *      1. It creates a static 1:1 mapping to all usable memory.
 *      2. It maps to each iommu if successful.
 *      3. Each iommu maps to this domain if successful.
 */
302 static struct dmar_domain *si_domain;
303 static int hw_pass_through = 1;
304
305 /*
306  * Domain represents a virtual machine, more than one devices
307  * across iommus may be owned in one domain, e.g. kvm guest.
308  */
309 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 0)
310
/* si_domain contains multiple devices */
312 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 1)
313
314 #define for_each_domain_iommu(idx, domain)                      \
315         for (idx = 0; idx < g_num_of_iommus; idx++)             \
316                 if (domain->iommu_refcnt[idx])
317
318 struct dmar_rmrr_unit {
319         struct list_head list;          /* list of rmrr units   */
320         struct acpi_dmar_header *hdr;   /* ACPI header          */
321         u64     base_address;           /* reserved base address*/
322         u64     end_address;            /* reserved end address */
323         struct dmar_dev_scope *devices; /* target devices */
324         int     devices_cnt;            /* target device count */
325         struct iommu_resv_region *resv; /* reserved region handle */
326 };
327
328 struct dmar_atsr_unit {
329         struct list_head list;          /* list of ATSR units */
330         struct acpi_dmar_header *hdr;   /* ACPI header */
331         struct dmar_dev_scope *devices; /* target devices */
332         int devices_cnt;                /* target device count */
333         u8 include_all:1;               /* include all ports */
334 };
335
336 static LIST_HEAD(dmar_atsr_units);
337 static LIST_HEAD(dmar_rmrr_units);
338
339 #define for_each_rmrr_units(rmrr) \
340         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
341
342 /* bitmap for indexing intel_iommus */
343 static int g_num_of_iommus;
344
345 static void domain_exit(struct dmar_domain *domain);
346 static void domain_remove_dev_info(struct dmar_domain *domain);
347 static void dmar_remove_one_dev_info(struct device *dev);
348 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
349 static void domain_context_clear(struct intel_iommu *iommu,
350                                  struct device *dev);
351 static int domain_detach_iommu(struct dmar_domain *domain,
352                                struct intel_iommu *iommu);
353
354 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
355 int dmar_disabled = 0;
356 #else
357 int dmar_disabled = 1;
358 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
359
360 int intel_iommu_enabled = 0;
361 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
362
363 static int dmar_map_gfx = 1;
364 static int dmar_forcedac;
365 static int intel_iommu_strict;
366 static int intel_iommu_superpage = 1;
367 static int intel_iommu_sm;
368 static int iommu_identity_mapping;
369
370 #define IDENTMAP_ALL            1
371 #define IDENTMAP_GFX            2
372 #define IDENTMAP_AZALIA         4
373
374 #define sm_supported(iommu)     (intel_iommu_sm && ecap_smts((iommu)->ecap))
375 #define pasid_supported(iommu)  (sm_supported(iommu) &&                 \
376                                  ecap_pasid((iommu)->ecap))
377
378 int intel_iommu_gfx_mapped;
379 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
380
381 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
382 static DEFINE_SPINLOCK(device_domain_lock);
383 static LIST_HEAD(device_domain_list);
384
385 /*
386  * Iterate over elements in device_domain_list and call the specified
387  * callback @fn against each element.
388  */
389 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
390                                      void *data), void *data)
391 {
392         int ret = 0;
393         unsigned long flags;
394         struct device_domain_info *info;
395
396         spin_lock_irqsave(&device_domain_lock, flags);
397         list_for_each_entry(info, &device_domain_list, global) {
398                 ret = fn(info, data);
399                 if (ret) {
400                         spin_unlock_irqrestore(&device_domain_lock, flags);
401                         return ret;
402                 }
403         }
404         spin_unlock_irqrestore(&device_domain_lock, flags);
405
406         return 0;
407 }
408
409 const struct iommu_ops intel_iommu_ops;
410
/* True if hardware translation was already on when we found this iommu. */
static bool translation_pre_enabled(struct intel_iommu *iommu)
{
        return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
}

/* Forget the pre-enabled state for @iommu. */
static void clear_translation_pre_enabled(struct intel_iommu *iommu)
{
        iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
}

/*
 * Record whether translation was already enabled (DMA_GSTS_TES set in
 * the global status register) when this driver started — presumably
 * left on by firmware or a previous kernel; flagged for later handling.
 */
static void init_translation_status(struct intel_iommu *iommu)
{
        u32 gsts;

        gsts = readl(iommu->reg + DMAR_GSTS_REG);
        if (gsts & DMA_GSTS_TES)
                iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
}

/* Convert a generic 'struct iommu_domain' to the private 'struct dmar_domain'. */
static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
{
        return container_of(dom, struct dmar_domain, domain);
}
435
436 static int __init intel_iommu_setup(char *str)
437 {
438         if (!str)
439                 return -EINVAL;
440         while (*str) {
441                 if (!strncmp(str, "on", 2)) {
442                         dmar_disabled = 0;
443                         pr_info("IOMMU enabled\n");
444                 } else if (!strncmp(str, "off", 3)) {
445                         dmar_disabled = 1;
446                         no_platform_optin = 1;
447                         pr_info("IOMMU disabled\n");
448                 } else if (!strncmp(str, "igfx_off", 8)) {
449                         dmar_map_gfx = 0;
450                         pr_info("Disable GFX device mapping\n");
451                 } else if (!strncmp(str, "forcedac", 8)) {
452                         pr_info("Forcing DAC for PCI devices\n");
453                         dmar_forcedac = 1;
454                 } else if (!strncmp(str, "strict", 6)) {
455                         pr_info("Disable batched IOTLB flush\n");
456                         intel_iommu_strict = 1;
457                 } else if (!strncmp(str, "sp_off", 6)) {
458                         pr_info("Disable supported super page\n");
459                         intel_iommu_superpage = 0;
460                 } else if (!strncmp(str, "sm_on", 5)) {
461                         pr_info("Intel-IOMMU: scalable mode supported\n");
462                         intel_iommu_sm = 1;
463                 } else if (!strncmp(str, "tboot_noforce", 13)) {
464                         printk(KERN_INFO
465                                 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
466                         intel_iommu_tboot_noforce = 1;
467                 }
468
469                 str += strcspn(str, ",");
470                 while (*str == ',')
471                         str++;
472         }
473         return 0;
474 }
475 __setup("intel_iommu=", intel_iommu_setup);
476
477 static struct kmem_cache *iommu_domain_cache;
478 static struct kmem_cache *iommu_devinfo_cache;
479
480 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
481 {
482         struct dmar_domain **domains;
483         int idx = did >> 8;
484
485         domains = iommu->domains[idx];
486         if (!domains)
487                 return NULL;
488
489         return domains[did & 0xff];
490 }
491
/*
 * Record @domain under domain-id @did in @iommu's two-level domain
 * table, allocating the 256-entry second level on first use.
 */
static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
                             struct dmar_domain *domain)
{
        struct dmar_domain **domains;
        int idx = did >> 8;

        if (!iommu->domains[idx]) {
                size_t size = 256 * sizeof(struct dmar_domain *);
                /* GFP_ATOMIC — presumably callers may hold a spinlock;
                 * TODO confirm against call sites. */
                iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
        }

        domains = iommu->domains[idx];
        /* Allocation failure: warn and drop the update. */
        if (WARN_ON(!domains))
                return;
        else
                domains[did & 0xff] = domain;
}
509
/* Allocate one zeroed page on @node for use as an IOMMU (page) table.
 * Returns the page's virtual address, or NULL on failure. */
void *alloc_pgtable_page(int node)
{
        struct page *page;
        void *vaddr = NULL;

        page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
        if (page)
                vaddr = page_address(page);
        return vaddr;
}

/* Free a page obtained from alloc_pgtable_page(). */
void free_pgtable_page(void *vaddr)
{
        free_page((unsigned long)vaddr);
}

/* Allocate a domain object from its dedicated slab cache. */
static inline void *alloc_domain_mem(void)
{
        return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
}

/* Return a domain object to the slab cache. */
static void free_domain_mem(void *vaddr)
{
        kmem_cache_free(iommu_domain_cache, vaddr);
}

/* Allocate a device_domain_info object from its slab cache. */
static inline void * alloc_devinfo_mem(void)
{
        return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
}

/* Return a device_domain_info object to the slab cache. */
static inline void free_devinfo_mem(void *vaddr)
{
        kmem_cache_free(iommu_devinfo_cache, vaddr);
}
545
/* True if the domain was created for a virtual machine (e.g. kvm guest). */
static inline int domain_type_is_vm(struct dmar_domain *domain)
{
        return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
}

/* True if the domain is the static identity-mapping (si) domain. */
static inline int domain_type_is_si(struct dmar_domain *domain)
{
        return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
}

/* True if the domain is either a VM domain or the si domain. */
static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
{
        return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
                                DOMAIN_FLAG_STATIC_IDENTITY);
}

/* True if @pfn fits within the address width of @domain's page tables. */
static inline int domain_pfn_supported(struct dmar_domain *domain,
                                       unsigned long pfn)
{
        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;

        return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
}
569
570 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
571 {
572         unsigned long sagaw;
573         int agaw = -1;
574
575         sagaw = cap_sagaw(iommu->cap);
576         for (agaw = width_to_agaw(max_gaw);
577              agaw >= 0; agaw--) {
578                 if (test_bit(agaw, &sagaw))
579                         break;
580         }
581
582         return agaw;
583 }
584
/*
 * Calculate the max SAGAW for each iommu.
 */
int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
{
        return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
}

/*
 * Calculate the agaw for each iommu.
 * "SAGAW" may be different across iommus: start from a default agaw
 * and fall back to the widest supported agaw narrower than the default
 * for iommus that don't support it.
 */
int iommu_calculate_agaw(struct intel_iommu *iommu)
{
        return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
}
602
/* This function only returns the single iommu attached to a domain. */
struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
{
        int iommu_id;

        /* si_domain and vm domain should not get here. */
        BUG_ON(domain_type_is_vm_or_si(domain));
        /* Grab the first iommu index holding a refcount on this domain. */
        for_each_domain_iommu(iommu_id, domain)
                break;

        /* No iommu attached: the loop ran past the end of the table. */
        if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
                return NULL;

        return g_iommus[iommu_id];
}
618
/*
 * Recompute domain->iommu_coherency: 1 only when every iommu the
 * domain is attached to reports coherent page walks (ecap_coherent).
 * When the domain is not yet attached to any iommu, fall back to the
 * lowest common denominator across all active iommus.
 */
static void domain_update_iommu_coherency(struct dmar_domain *domain)
{
        struct dmar_drhd_unit *drhd;
        struct intel_iommu *iommu;
        bool found = false;
        int i;

        domain->iommu_coherency = 1;

        for_each_domain_iommu(i, domain) {
                found = true;
                if (!ecap_coherent(g_iommus[i]->ecap)) {
                        domain->iommu_coherency = 0;
                        break;
                }
        }
        if (found)
                return;

        /* No hardware attached; use lowest common denominator */
        rcu_read_lock();
        for_each_active_iommu(iommu, drhd) {
                if (!ecap_coherent(iommu->ecap)) {
                        domain->iommu_coherency = 0;
                        break;
                }
        }
        rcu_read_unlock();
}
648
649 static int domain_update_iommu_snooping(struct intel_iommu *skip)
650 {
651         struct dmar_drhd_unit *drhd;
652         struct intel_iommu *iommu;
653         int ret = 1;
654
655         rcu_read_lock();
656         for_each_active_iommu(iommu, drhd) {
657                 if (iommu != skip) {
658                         if (!ecap_sc_support(iommu->ecap)) {
659                                 ret = 0;
660                                 break;
661                         }
662                 }
663         }
664         rcu_read_unlock();
665
666         return ret;
667 }
668
/*
 * Return the highest super-page level supported by every active iommu
 * except @skip, expressed as fls() of the common capability mask.
 * Returns 0 when super pages are disabled on the command line or not
 * commonly supported.
 */
static int domain_update_iommu_superpage(struct intel_iommu *skip)
{
        struct dmar_drhd_unit *drhd;
        struct intel_iommu *iommu;
        int mask = 0xf;

        if (!intel_iommu_superpage) {
                return 0;
        }

        /* set iommu_superpage to the smallest common denominator */
        rcu_read_lock();
        for_each_active_iommu(iommu, drhd) {
                if (iommu != skip) {
                        mask &= cap_super_page_val(iommu->cap);
                        if (!mask)
                                break;
                }
        }
        rcu_read_unlock();

        return fls(mask);
}
692
/* Some capabilities may be different across iommus */
static void domain_update_iommu_cap(struct dmar_domain *domain)
{
        /* Refresh the cached coherency, snooping and super-page values. */
        domain_update_iommu_coherency(domain);
        domain->iommu_snooping = domain_update_iommu_snooping(NULL);
        domain->iommu_superpage = domain_update_iommu_superpage(NULL);
}
700
/*
 * Return the context entry for (@bus, @devfn) on @iommu.  When @alloc
 * is non-zero, allocate the context table page if it does not exist
 * yet; otherwise return NULL for an absent table.  Callers in this
 * file invoke it under iommu->lock.
 */
struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
                                         u8 devfn, int alloc)
{
        struct root_entry *root = &iommu->root_entry[bus];
        struct context_entry *context;
        u64 *entry;

        entry = &root->lo;
        if (sm_supported(iommu)) {
                /* Scalable mode: the upper half of the root entry covers
                 * devfn 0x80-0xff, and each device takes two slots (hence
                 * devfn * 2) — see the VT-d spec for the entry layout. */
                if (devfn >= 0x80) {
                        devfn -= 0x80;
                        entry = &root->hi;
                }
                devfn *= 2;
        }
        if (*entry & 1)
                /* Present: table exists, translate its physical address. */
                context = phys_to_virt(*entry & VTD_PAGE_MASK);
        else {
                unsigned long phy_addr;
                if (!alloc)
                        return NULL;

                context = alloc_pgtable_page(iommu->node);
                if (!context)
                        return NULL;

                /* Flush the new table and the updated root entry so the
                 * hardware observes them. */
                __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
                phy_addr = virt_to_phys((void *)context);
                *entry = phy_addr | 1;
                __iommu_flush_cache(iommu, entry, sizeof(*entry));
        }
        return &context[devfn];
}
734
/* True if @dev was marked with the sentinel meaning "ignore for IOMMU". */
static int iommu_dummy(struct device *dev)
{
        return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
}
739
/*
 * Find the IOMMU (DRHD unit) responsible for @dev and report, via
 * @bus/@devfn, the BDF to use for context-table lookups.  Returns NULL
 * for dummy devices, VMD children, or when no DRHD scope matches.  For
 * PCI VFs the PF is used for the scope lookup, but the VF's own BDF is
 * returned.
 */
static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
{
        struct dmar_drhd_unit *drhd = NULL;
        struct intel_iommu *iommu;
        struct device *tmp;
        struct pci_dev *ptmp, *pdev = NULL;
        u16 segment = 0;
        int i;

        if (iommu_dummy(dev))
                return NULL;

        if (dev_is_pci(dev)) {
                struct pci_dev *pf_pdev;

                pdev = to_pci_dev(dev);

#ifdef CONFIG_X86
                /* VMD child devices currently cannot be handled individually */
                if (is_vmd(pdev->bus))
                        return NULL;
#endif

                /* VFs aren't listed in scope tables; we need to look up
                 * the PF instead to find the IOMMU. */
                pf_pdev = pci_physfn(pdev);
                dev = &pf_pdev->dev;
                segment = pci_domain_nr(pdev->bus);
        } else if (has_acpi_companion(dev))
                dev = &ACPI_COMPANION(dev)->dev;

        rcu_read_lock();
        for_each_active_iommu(iommu, drhd) {
                /* PCI devices only match units on the same segment. */
                if (pdev && segment != drhd->segment)
                        continue;

                for_each_active_dev_scope(drhd->devices,
                                          drhd->devices_cnt, i, tmp) {
                        if (tmp == dev) {
                                /* For a VF use its original BDF# not that of the PF
                                 * which we used for the IOMMU lookup. Strictly speaking
                                 * we could do this for all PCI devices; we only need to
                                 * get the BDF# from the scope table for ACPI matches. */
                                if (pdev && pdev->is_virtfn)
                                        goto got_pdev;

                                *bus = drhd->devices[i].bus;
                                *devfn = drhd->devices[i].devfn;
                                goto out;
                        }

                        if (!pdev || !dev_is_pci(tmp))
                                continue;

                        /* A bridge in scope covers every bus number behind
                         * its subordinate range. */
                        ptmp = to_pci_dev(tmp);
                        if (ptmp->subordinate &&
                            ptmp->subordinate->number <= pdev->bus->number &&
                            ptmp->subordinate->busn_res.end >= pdev->bus->number)
                                goto got_pdev;
                }

                if (pdev && drhd->include_all) {
                got_pdev:
                        *bus = pdev->bus->number;
                        *devfn = pdev->devfn;
                        goto out;
                }
        }
        iommu = NULL;
 out:
        rcu_read_unlock();

        return iommu;
}
814
/*
 * Flush @size bytes at @addr from the CPU cache when the domain's
 * iommus do not snoop page walks (iommu_coherency == 0).
 */
static void domain_flush_cache(struct dmar_domain *domain,
                               void *addr, int size)
{
        if (!domain->iommu_coherency)
                clflush_cache_range(addr, size);
}
821
822 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
823 {
824         struct context_entry *context;
825         int ret = 0;
826         unsigned long flags;
827
828         spin_lock_irqsave(&iommu->lock, flags);
829         context = iommu_context_addr(iommu, bus, devfn, 0);
830         if (context)
831                 ret = context_present(context);
832         spin_unlock_irqrestore(&iommu->lock, flags);
833         return ret;
834 }
835
/*
 * Free every context table reachable from @iommu's root table, then
 * the root table itself.  Runs under iommu->lock.
 */
static void free_context_table(struct intel_iommu *iommu)
{
        int i;
        unsigned long flags;
        struct context_entry *context;

        spin_lock_irqsave(&iommu->lock, flags);
        if (!iommu->root_entry) {
                goto out;
        }
        for (i = 0; i < ROOT_ENTRY_NR; i++) {
                context = iommu_context_addr(iommu, i, 0, 0);
                if (context)
                        free_pgtable_page(context);

                /* In scalable mode each root entry has an upper half
                 * covering devfn 0x80-0xff; free that table too. */
                if (!sm_supported(iommu))
                        continue;

                context = iommu_context_addr(iommu, i, 0x80, 0);
                if (context)
                        free_pgtable_page(context);

        }
        free_pgtable_page(iommu->root_entry);
        iommu->root_entry = NULL;
out:
        spin_unlock_irqrestore(&iommu->lock, flags);
}
864
/*
 * Walk (and, where missing, build) @domain's page table down to the
 * entry covering @pfn.
 *
 * *target_level names the level the caller wants; 0 means "descend as
 * far as possible, stopping at a superpage or non-present entry", in
 * which case the level actually reached is written back through
 * @target_level.  Returns NULL if @pfn exceeds the domain's address
 * width or a table page cannot be allocated.
 */
static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
                                      unsigned long pfn, int *target_level)
{
        struct dma_pte *parent, *pte;
        int level = agaw_to_level(domain->agaw);
        int offset;

        BUG_ON(!domain->pgd);

        if (!domain_pfn_supported(domain, pfn))
                /* Address beyond IOMMU's addressing capabilities. */
                return NULL;

        parent = domain->pgd;

        while (1) {
                void *tmp_page;

                offset = pfn_level_offset(pfn, level);
                pte = &parent[offset];
                if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
                        break;
                if (level == *target_level)
                        break;

                if (!dma_pte_present(pte)) {
                        uint64_t pteval;

                        tmp_page = alloc_pgtable_page(domain->nid);

                        if (!tmp_page)
                                return NULL;

                        domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
                        pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
                        /* Lock-free install: cmpxchg64 publishes the new
                         * table only if the entry is still empty. */
                        if (cmpxchg64(&pte->val, 0ULL, pteval))
                                /* Someone else set it while we were thinking; use theirs. */
                                free_pgtable_page(tmp_page);
                        else
                                domain_flush_cache(domain, pte, sizeof(*pte));
                }
                if (level == 1)
                        break;

                parent = phys_to_virt(dma_pte_addr(pte));
                level--;
        }

        if (!*target_level)
                *target_level = level;

        return pte;
}
918
919
/* return address's pte at specific level */
/*
 * Like pfn_to_dma_pte() but never allocates: return the pte for @pfn
 * at @level, or the superpage pte covering it at a higher level
 * (reported via @large_page).  Returns NULL when the walk hits a
 * non-present entry; @large_page then holds the level of the hole.
 */
static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
                                         unsigned long pfn,
                                         int level, int *large_page)
{
        struct dma_pte *parent, *pte;
        int total = agaw_to_level(domain->agaw);
        int offset;

        parent = domain->pgd;
        while (level <= total) {
                offset = pfn_level_offset(pfn, total);
                pte = &parent[offset];
                if (level == total)
                        return pte;

                if (!dma_pte_present(pte)) {
                        *large_page = total;
                        break;
                }

                if (dma_pte_superpage(pte)) {
                        *large_page = total;
                        return pte;
                }

                parent = phys_to_virt(dma_pte_addr(pte));
                total--;
        }
        return NULL;
}
951
/*
 * Clear all last-level (leaf or superpage) PTEs in [start_pfn, last_pfn].
 * Page-table pages themselves are left in place; an IOTLB flush must
 * follow before the underlying memory may be reused.
 */
static void dma_pte_clear_range(struct dmar_domain *domain,
				unsigned long start_pfn,
				unsigned long last_pfn)
{
	unsigned int large_page;
	struct dma_pte *first_pte, *pte;

	BUG_ON(!domain_pfn_supported(domain, start_pfn));
	BUG_ON(!domain_pfn_supported(domain, last_pfn));
	BUG_ON(start_pfn > last_pfn);

	/* we don't need lock here; nobody else touches the iova range */
	do {
		large_page = 1;
		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
		if (!pte) {
			/*
			 * No table at this point; skip to the next entry at
			 * the level where the walk stopped.
			 */
			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
			continue;
		}
		do {
			dma_clear_pte(pte);
			/* A superpage PTE accounts for many pfns at once. */
			start_pfn += lvl_to_nr_pages(large_page);
			pte++;
		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));

		/* Write the cleared PTEs back for non-coherent hardware. */
		domain_flush_cache(domain, first_pte,
				   (void *)pte - (void *)first_pte);

	} while (start_pfn && start_pfn <= last_pfn);
}
983
/*
 * Recursively free page-table pages covering [start_pfn, last_pfn] at and
 * below @level, but only below @retain_level. @pte is the table for
 * @level and @pfn the first pfn that table can cover. Leaf PTEs are
 * expected to have been cleared already (see dma_pte_clear_range()).
 */
static void dma_pte_free_level(struct dmar_domain *domain, int level,
			       int retain_level, struct dma_pte *pte,
			       unsigned long pfn, unsigned long start_pfn,
			       unsigned long last_pfn)
{
	pfn = max(start_pfn, pfn);
	pte = &pte[pfn_level_offset(pfn, level)];

	do {
		unsigned long level_pfn;
		struct dma_pte *level_pte;

		/* Nothing to descend into for empty or superpage entries. */
		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
			goto next;

		level_pfn = pfn & level_mask(level);
		level_pte = phys_to_virt(dma_pte_addr(pte));

		if (level > 2) {
			dma_pte_free_level(domain, level - 1, retain_level,
					   level_pte, level_pfn, start_pfn,
					   last_pfn);
		}

		/*
		 * Free the page table if we're below the level we want to
		 * retain and the range covers the entire table.
		 */
		if (level < retain_level && !(start_pfn > level_pfn ||
		      last_pfn < level_pfn + level_size(level) - 1)) {
			dma_clear_pte(pte);
			domain_flush_cache(domain, pte, sizeof(*pte));
			free_pgtable_page(level_pte);
		}
next:
		pfn += level_size(level);
	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
}
1022
/*
 * clear last level (leaf) ptes and free page table pages below the
 * level we wish to keep intact.
 *
 * NOTE(review): freed tables are returned to the allocator immediately;
 * callers must ensure the hardware can no longer walk them — confirm
 * against each call site's flushing.
 */
static void dma_pte_free_pagetable(struct dmar_domain *domain,
				   unsigned long start_pfn,
				   unsigned long last_pfn,
				   int retain_level)
{
	BUG_ON(!domain_pfn_supported(domain, start_pfn));
	BUG_ON(!domain_pfn_supported(domain, last_pfn));
	BUG_ON(start_pfn > last_pfn);

	/* Clear the leaf mappings first, then prune empty tables. */
	dma_pte_clear_range(domain, start_pfn, last_pfn);

	/* We don't need lock here; nobody else touches the iova range */
	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
			   domain->pgd, 0, start_pfn, last_pfn);

	/* free pgd */
	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
		free_pgtable_page(domain->pgd);
		domain->pgd = NULL;
	}
}
1048
/* When a page at a given level is being unlinked from its parent, we don't
   need to *modify* it at all. All we need to do is make a list of all the
   pages which can be freed just as soon as we've flushed the IOTLB and we
   know the hardware page-walk will no longer touch them.
   The 'pte' argument is the *parent* PTE, pointing to the page that is to
   be freed. Returns the new head of the freelist, chained through
   page->freelist. */
static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
					    int level, struct dma_pte *pte,
					    struct page *freelist)
{
	struct page *pg;

	/* Prepend the page this PTE points to onto the freelist. */
	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
	pg->freelist = freelist;
	freelist = pg;

	/* A leaf-level table has no child tables to collect. */
	if (level == 1)
		return freelist;

	pte = page_address(pg);
	do {
		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
			freelist = dma_pte_list_pagetables(domain, level - 1,
							   pte, freelist);
		pte++;
	} while (!first_pte_in_page(pte));

	return freelist;
}
1078
/*
 * Clear mappings in [start_pfn, last_pfn] at @level. Entries whose whole
 * span falls inside the range are cleared and their page-table pages
 * chained onto @freelist; partially covered entries are descended into.
 * Returns the updated freelist head.
 */
static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
					struct dma_pte *pte, unsigned long pfn,
					unsigned long start_pfn,
					unsigned long last_pfn,
					struct page *freelist)
{
	struct dma_pte *first_pte = NULL, *last_pte = NULL;

	pfn = max(start_pfn, pfn);
	pte = &pte[pfn_level_offset(pfn, level)];

	do {
		unsigned long level_pfn;

		if (!dma_pte_present(pte))
			goto next;

		level_pfn = pfn & level_mask(level);

		/* If range covers entire pagetable, free it */
		if (start_pfn <= level_pfn &&
		    last_pfn >= level_pfn + level_size(level) - 1) {
			/* These subordinate page tables are going away entirely. Don't
			   bother to clear them; we're just going to *free* them. */
			if (level > 1 && !dma_pte_superpage(pte))
				freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);

			dma_clear_pte(pte);
			/* Track the contiguous run of cleared PTEs for one flush. */
			if (!first_pte)
				first_pte = pte;
			last_pte = pte;
		} else if (level > 1) {
			/* Recurse down into a level that isn't *entirely* obsolete */
			freelist = dma_pte_clear_level(domain, level - 1,
						       phys_to_virt(dma_pte_addr(pte)),
						       level_pfn, start_pfn, last_pfn,
						       freelist);
		}
next:
		pfn += level_size(level);
	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);

	/* One cache flush for everything cleared at this level. */
	if (first_pte)
		domain_flush_cache(domain, first_pte,
				   (void *)++last_pte - (void *)first_pte);

	return freelist;
}
1127
/* We can't just free the pages because the IOMMU may still be walking
   the page tables, and may have cached the intermediate levels. The
   pages can only be freed after the IOTLB flush has been done.
   Returns a freelist (chained via page->freelist) for the caller to
   release with dma_free_pagelist() after flushing. */
static struct page *domain_unmap(struct dmar_domain *domain,
				 unsigned long start_pfn,
				 unsigned long last_pfn)
{
	struct page *freelist;

	BUG_ON(!domain_pfn_supported(domain, start_pfn));
	BUG_ON(!domain_pfn_supported(domain, last_pfn));
	BUG_ON(start_pfn > last_pfn);

	/* we don't need lock here; nobody else touches the iova range */
	freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
				       domain->pgd, 0, start_pfn, last_pfn, NULL);

	/* free pgd */
	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
		/* The whole address space went away: retire the root too. */
		struct page *pgd_page = virt_to_page(domain->pgd);
		pgd_page->freelist = freelist;
		freelist = pgd_page;

		domain->pgd = NULL;
	}

	return freelist;
}
1156
1157 static void dma_free_pagelist(struct page *freelist)
1158 {
1159         struct page *pg;
1160
1161         while ((pg = freelist)) {
1162                 freelist = pg->freelist;
1163                 free_pgtable_page(page_address(pg));
1164         }
1165 }
1166
/* Flush-queue callback: @data is the freelist head passed as an opaque value. */
static void iova_entry_free(unsigned long data)
{
	dma_free_pagelist((struct page *)data);
}
1173
/* iommu handling */
/* Allocate a zeroed root table for @iommu and publish it under the lock. */
static int iommu_alloc_root_entry(struct intel_iommu *iommu)
{
	struct root_entry *root;
	unsigned long flags;

	/* Allocate on the IOMMU's own NUMA node. */
	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
	if (!root) {
		pr_err("Allocating root entry for %s failed\n",
			iommu->name);
		return -ENOMEM;
	}

	/* Make the zeroed table visible to non-coherent hardware. */
	__iommu_flush_cache(iommu, root, ROOT_SIZE);

	spin_lock_irqsave(&iommu->lock, flags);
	iommu->root_entry = root;
	spin_unlock_irqrestore(&iommu->lock, flags);

	return 0;
}
1195
/*
 * Program the root table address into the hardware and issue a
 * Set Root Table Pointer command, waiting for completion.
 */
static void iommu_set_root_entry(struct intel_iommu *iommu)
{
	u64 addr;
	u32 sts;
	unsigned long flag;

	addr = virt_to_phys(iommu->root_entry);
	/* Scalable mode uses the same register with the SMT flag set. */
	if (sm_supported(iommu))
		addr |= DMA_RTADDR_SMT;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);

	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (sts & DMA_GSTS_RTPS), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
1217
/*
 * Flush the IOMMU's internal write buffer so queued translation-structure
 * writes reach memory. No-op unless the hardware (or a quirk) requires it.
 */
void iommu_flush_write_buffer(struct intel_iommu *iommu)
{
	u32 val;
	unsigned long flag;

	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
		return;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (!(val & DMA_GSTS_WBFS)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
1235
/*
 * Invalidate context-cache entries via the CCMD register and wait for the
 * hardware to complete. @type selects global, domain-selective or
 * device-selective invalidation; @did, @source_id and @function_mask
 * qualify the selective variants.
 */
static void __iommu_flush_context(struct intel_iommu *iommu,
				  u16 did, u16 source_id, u8 function_mask,
				  u64 type)
{
	u64 val = 0;
	unsigned long flag;

	switch (type) {
	case DMA_CCMD_GLOBAL_INVL:
		val = DMA_CCMD_GLOBAL_INVL;
		break;
	case DMA_CCMD_DOMAIN_INVL:
		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
		break;
	case DMA_CCMD_DEVICE_INVL:
		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
		break;
	default:
		BUG();
	}
	val |= DMA_CCMD_ICC;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
		dmar_readq, (!(val & DMA_CCMD_ICC)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
1269
/*
 * Invalidate IOTLB entries via the register-based interface and wait for
 * completion. @type selects global, domain-selective or page-selective
 * (PSI) invalidation; for PSI, @addr and @size_order describe the
 * naturally aligned 2^size_order page range (the IH bit may be folded
 * into @addr by the caller).
 */
static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
				u64 addr, unsigned int size_order, u64 type)
{
	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
	u64 val = 0, val_iva = 0;
	unsigned long flag;

	switch (type) {
	case DMA_TLB_GLOBAL_FLUSH:
		/* global flush doesn't need set IVA_REG */
		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
		break;
	case DMA_TLB_DSI_FLUSH:
		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		break;
	case DMA_TLB_PSI_FLUSH:
		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		/* IH bit is passed in as part of address */
		val_iva = size_order | addr;
		break;
	default:
		BUG();
	}
	/* Note: set drain read/write */
#if 0
	/*
	 * This is probably to be super secure.. Looks like we can
	 * ignore it without any impact.
	 */
	if (cap_read_drain(iommu->cap))
		val |= DMA_TLB_READ_DRAIN;
#endif
	if (cap_write_drain(iommu->cap))
		val |= DMA_TLB_WRITE_DRAIN;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	/* Note: Only uses first TLB reg currently */
	if (val_iva)
		dmar_writeq(iommu->reg + tlb_offset, val_iva);
	dmar_writeq(iommu->reg + tlb_offset + 8, val);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
		dmar_readq, (!(val & DMA_TLB_IVT)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);

	/* check IOTLB invalidation granularity */
	if (DMA_TLB_IAIG(val) == 0)
		pr_err("Flush IOTLB failed\n");
	/* Hardware may flush at a coarser granularity than requested. */
	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
		pr_debug("TLB flush request %Lx, actual %Lx\n",
			(unsigned long long)DMA_TLB_IIRG(type),
			(unsigned long long)DMA_TLB_IAIG(val));
}
1326
1327 static struct device_domain_info *
1328 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1329                          u8 bus, u8 devfn)
1330 {
1331         struct device_domain_info *info;
1332
1333         assert_spin_locked(&device_domain_lock);
1334
1335         if (!iommu->qi)
1336                 return NULL;
1337
1338         list_for_each_entry(info, &domain->devices, link)
1339                 if (info->iommu == iommu && info->bus == bus &&
1340                     info->devfn == devfn) {
1341                         if (info->ats_supported && info->dev)
1342                                 return info;
1343                         break;
1344                 }
1345
1346         return NULL;
1347 }
1348
1349 static void domain_update_iotlb(struct dmar_domain *domain)
1350 {
1351         struct device_domain_info *info;
1352         bool has_iotlb_device = false;
1353
1354         assert_spin_locked(&device_domain_lock);
1355
1356         list_for_each_entry(info, &domain->devices, link) {
1357                 struct pci_dev *pdev;
1358
1359                 if (!info->dev || !dev_is_pci(info->dev))
1360                         continue;
1361
1362                 pdev = to_pci_dev(info->dev);
1363                 if (pdev->ats_enabled) {
1364                         has_iotlb_device = true;
1365                         break;
1366                 }
1367         }
1368
1369         domain->has_iotlb_device = has_iotlb_device;
1370 }
1371
/*
 * Enable PASID, PRI and ATS for @info's PCI device where supported.
 * Ordering matters: PASID must be enabled before ATS (see the comment
 * below). Caller must hold device_domain_lock.
 */
static void iommu_enable_dev_iotlb(struct device_domain_info *info)
{
	struct pci_dev *pdev;

	assert_spin_locked(&device_domain_lock);

	if (!info || !dev_is_pci(info->dev))
		return;

	pdev = to_pci_dev(info->dev);
	/* For IOMMU that supports device IOTLB throttling (DIT), we assign
	 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
	 * queue depth at PF level. If DIT is not set, PFSID will be treated as
	 * reserved, which should be set to 0.
	 */
	if (!ecap_dit(info->iommu->ecap))
		info->pfsid = 0;
	else {
		struct pci_dev *pf_pdev;

		/* pdev will be returned if device is not a vf */
		pf_pdev = pci_physfn(pdev);
		info->pfsid = PCI_DEVID(pf_pdev->bus->number, pf_pdev->devfn);
	}

#ifdef CONFIG_INTEL_IOMMU_SVM
	/* The PCIe spec, in its wisdom, declares that the behaviour of
	   the device if you enable PASID support after ATS support is
	   undefined. So always enable PASID support on devices which
	   have it, even if we can't yet know if we're ever going to
	   use it. */
	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
		info->pasid_enabled = 1;

	/* PRI needs PASID-tagged page responses when PASID is in use. */
	if (info->pri_supported &&
	    (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
	    !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
		info->pri_enabled = 1;
#endif
	/* ATS is withheld from untrusted (e.g. external) devices. */
	if (!pdev->untrusted && info->ats_supported &&
	    pci_ats_page_aligned(pdev) &&
	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
		info->ats_enabled = 1;
		domain_update_iotlb(info->domain);
		info->ats_qdep = pci_ats_queue_depth(pdev);
	}
}
1419
/*
 * Disable ATS, PRI and PASID for @info's PCI device, undoing
 * iommu_enable_dev_iotlb(). Caller must hold device_domain_lock.
 */
static void iommu_disable_dev_iotlb(struct device_domain_info *info)
{
	struct pci_dev *pdev;

	assert_spin_locked(&device_domain_lock);

	if (!dev_is_pci(info->dev))
		return;

	pdev = to_pci_dev(info->dev);

	if (info->ats_enabled) {
		pci_disable_ats(pdev);
		info->ats_enabled = 0;
		/* The domain may no longer contain any ATS device. */
		domain_update_iotlb(info->domain);
	}
#ifdef CONFIG_INTEL_IOMMU_SVM
	if (info->pri_enabled) {
		pci_disable_pri(pdev);
		info->pri_enabled = 0;
	}
	if (info->pasid_enabled) {
		pci_disable_pasid(pdev);
		info->pasid_enabled = 0;
	}
#endif
}
1447
/*
 * Issue device-IOTLB (ATS) invalidations for [@addr, @addr + 2^@mask pages)
 * to every ATS-enabled device attached to @domain.
 */
static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
				  u64 addr, unsigned mask)
{
	u16 sid, qdep;
	unsigned long flags;
	struct device_domain_info *info;

	/* Fast path: no ATS-enabled device in this domain. */
	if (!domain->has_iotlb_device)
		return;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_entry(info, &domain->devices, link) {
		if (!info->ats_enabled)
			continue;

		/* Requester ID is bus number in the high byte. */
		sid = info->bus << 8 | info->devfn;
		qdep = info->ats_qdep;
		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
				qdep, addr, mask);
	}
	spin_unlock_irqrestore(&device_domain_lock, flags);
}
1470
/*
 * Flush the IOTLB for @pages pages starting at @pfn in @domain on @iommu.
 * @ih requests "invalidation hint" semantics (leaf-only change); @map is
 * non-zero when the flush is for a newly created (non-present -> present)
 * mapping.
 */
static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
				  struct dmar_domain *domain,
				  unsigned long pfn, unsigned int pages,
				  int ih, int map)
{
	/* PSI needs a power-of-two, naturally aligned range. */
	unsigned int mask = ilog2(__roundup_pow_of_two(pages));
	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
	u16 did = domain->iommu_did[iommu->seq_id];

	BUG_ON(pages == 0);

	/* IH occupies bit 6 of the invalidation address. */
	if (ih)
		ih = 1 << 6;
	/*
	 * Fallback to domain selective flush if no PSI support or the size is
	 * too big.
	 * PSI requires page size to be 2 ^ x, and the base address is naturally
	 * aligned to the size
	 */
	if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
		iommu->flush.flush_iotlb(iommu, did, 0, 0,
						DMA_TLB_DSI_FLUSH);
	else
		iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
						DMA_TLB_PSI_FLUSH);

	/*
	 * In caching mode, changes of pages from non-present to present require
	 * flush. However, device IOTLB doesn't need to be flushed in this case.
	 */
	if (!cap_caching_mode(iommu->cap) || !map)
		iommu_flush_dev_iotlb(domain, addr, mask);
}
1504
1505 /* Notification for newly created mappings */
1506 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1507                                         struct dmar_domain *domain,
1508                                         unsigned long pfn, unsigned int pages)
1509 {
1510         /* It's a non-present to present mapping. Only flush if caching mode */
1511         if (cap_caching_mode(iommu->cap))
1512                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1513         else
1514                 iommu_flush_write_buffer(iommu);
1515 }
1516
/*
 * Flush-queue callback: domain-selective IOTLB flush on every IOMMU the
 * domain is attached to, plus a device-IOTLB flush where the hardware is
 * not in caching mode.
 */
static void iommu_flush_iova(struct iova_domain *iovad)
{
	struct dmar_domain *domain;
	int idx;

	domain = container_of(iovad, struct dmar_domain, iovad);

	for_each_domain_iommu(idx, domain) {
		struct intel_iommu *iommu = g_iommus[idx];
		u16 did = domain->iommu_did[iommu->seq_id];

		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);

		if (!cap_caching_mode(iommu->cap))
			iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
					      0, MAX_AGAW_PFN_WIDTH);
	}
}
1535
/*
 * Disable the protected low/high memory regions (firmware-established DMA
 * protection) so normal DMA remapping takes over. No-op if the hardware
 * supports neither region.
 */
static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
{
	u32 pmen;
	unsigned long flags;

	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
		return;

	raw_spin_lock_irqsave(&iommu->register_lock, flags);
	pmen = readl(iommu->reg + DMAR_PMEN_REG);
	pmen &= ~DMA_PMEN_EPM;
	writel(pmen, iommu->reg + DMAR_PMEN_REG);

	/* wait for the protected region status bit to clear */
	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
		readl, !(pmen & DMA_PMEN_PRS), pmen);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
}
1555
/* Turn on DMA remapping (Translation Enable) and wait for confirmation. */
static void iommu_enable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flags;

	raw_spin_lock_irqsave(&iommu->register_lock, flags);
	/* gcmd caches the enabled command bits for later writes. */
	iommu->gcmd |= DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (sts & DMA_GSTS_TES), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
}
1571
/* Turn off DMA remapping and wait for the status bit to clear. */
static void iommu_disable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flag;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	iommu->gcmd &= ~DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (!(sts & DMA_GSTS_TES)), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
1587
1588
/*
 * Allocate the per-IOMMU domain-id bitmap and the two-level domain pointer
 * array (pages of 256 entries each; only page 0 is pre-allocated here).
 * Reserves domain-id 0 and, in scalable mode, FLPT_DEFAULT_DID.
 */
static int iommu_init_domains(struct intel_iommu *iommu)
{
	u32 ndomains, nlongs;
	size_t size;

	ndomains = cap_ndoms(iommu->cap);
	pr_debug("%s: Number of Domains supported <%d>\n",
		 iommu->name, ndomains);
	nlongs = BITS_TO_LONGS(ndomains);

	spin_lock_init(&iommu->lock);

	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
	if (!iommu->domain_ids) {
		pr_err("%s: Allocating domain id array failed\n",
		       iommu->name);
		return -ENOMEM;
	}

	/* One slot per 256-domain page, rounded up. */
	size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
	iommu->domains = kzalloc(size, GFP_KERNEL);

	if (iommu->domains) {
		size = 256 * sizeof(struct dmar_domain *);
		iommu->domains[0] = kzalloc(size, GFP_KERNEL);
	}

	if (!iommu->domains || !iommu->domains[0]) {
		pr_err("%s: Allocating domain array failed\n",
		       iommu->name);
		kfree(iommu->domain_ids);
		kfree(iommu->domains);
		iommu->domain_ids = NULL;
		iommu->domains    = NULL;
		return -ENOMEM;
	}



	/*
	 * If Caching mode is set, then invalid translations are tagged
	 * with domain-id 0, hence we need to pre-allocate it. We also
	 * use domain-id 0 as a marker for non-allocated domain-id, so
	 * make sure it is not used for a real domain.
	 */
	set_bit(0, iommu->domain_ids);

	/*
	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
	 * entry for first-level or pass-through translation modes should
	 * be programmed with a domain id different from those used for
	 * second-level or nested translation. We reserve a domain id for
	 * this purpose.
	 */
	if (sm_supported(iommu))
		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);

	return 0;
}
1648
/*
 * Detach all devices from @iommu and disable translation. Domains that
 * are not VM/SI type are fully torn down; because domain_exit() takes
 * device_domain_lock itself, the lock is dropped and the scan restarted
 * for each such domain.
 */
static void disable_dmar_iommu(struct intel_iommu *iommu)
{
	struct device_domain_info *info, *tmp;
	unsigned long flags;

	if (!iommu->domains || !iommu->domain_ids)
		return;

again:
	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
		struct dmar_domain *domain;

		if (info->iommu != iommu)
			continue;

		if (!info->dev || !info->domain)
			continue;

		domain = info->domain;

		__dmar_remove_one_dev_info(info);

		if (!domain_type_is_vm_or_si(domain)) {
			/*
			 * The domain_exit() function  can't be called under
			 * device_domain_lock, as it takes this lock itself.
			 * So release the lock here and re-run the loop
			 * afterwards.
			 */
			spin_unlock_irqrestore(&device_domain_lock, flags);
			domain_exit(domain);
			goto again;
		}
	}
	spin_unlock_irqrestore(&device_domain_lock, flags);

	if (iommu->gcmd & DMA_GCMD_TE)
		iommu_disable_translation(iommu);
}
1689
/*
 * Release all per-IOMMU resources: the domain pointer pages and id
 * bitmap, the global g_iommus slot, the context table, and (with SVM)
 * the page-request queue.
 */
static void free_dmar_iommu(struct intel_iommu *iommu)
{
	if ((iommu->domains) && (iommu->domain_ids)) {
		/* Number of 256-entry domain pointer pages (see iommu_init_domains()). */
		int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
		int i;

		for (i = 0; i < elems; i++)
			kfree(iommu->domains[i]);
		kfree(iommu->domains);
		kfree(iommu->domain_ids);
		iommu->domains = NULL;
		iommu->domain_ids = NULL;
	}

	g_iommus[iommu->seq_id] = NULL;

	/* free context mapping */
	free_context_table(iommu);

#ifdef CONFIG_INTEL_IOMMU_SVM
	if (pasid_supported(iommu)) {
		if (ecap_prs(iommu->ecap))
			intel_svm_finish_prq(iommu);
	}
#endif
}
1716
1717 static struct dmar_domain *alloc_domain(int flags)
1718 {
1719         struct dmar_domain *domain;
1720
1721         domain = alloc_domain_mem();
1722         if (!domain)
1723                 return NULL;
1724
1725         memset(domain, 0, sizeof(*domain));
1726         domain->nid = NUMA_NO_NODE;
1727         domain->flags = flags;
1728         domain->has_iotlb_device = false;
1729         INIT_LIST_HEAD(&domain->devices);
1730
1731         return domain;
1732 }
1733
/* Must be called with iommu->lock */
/*
 * Take a reference on @domain's attachment to @iommu; on the first
 * attachment, allocate a hardware domain id on that IOMMU. Also requires
 * device_domain_lock. Returns 0 or -ENOSPC if no domain id is free.
 */
static int domain_attach_iommu(struct dmar_domain *domain,
			       struct intel_iommu *iommu)
{
	unsigned long ndomains;
	int num;

	assert_spin_locked(&device_domain_lock);
	assert_spin_locked(&iommu->lock);

	domain->iommu_refcnt[iommu->seq_id] += 1;
	domain->iommu_count += 1;
	if (domain->iommu_refcnt[iommu->seq_id] == 1) {
		ndomains = cap_ndoms(iommu->cap);
		num      = find_first_zero_bit(iommu->domain_ids, ndomains);

		if (num >= ndomains) {
			pr_err("%s: No free domain ids\n", iommu->name);
			/* Roll back the refcounts taken above. */
			domain->iommu_refcnt[iommu->seq_id] -= 1;
			domain->iommu_count -= 1;
			return -ENOSPC;
		}

		set_bit(num, iommu->domain_ids);
		set_iommu_domain(iommu, num, domain);

		domain->iommu_did[iommu->seq_id] = num;
		domain->nid                      = iommu->node;

		domain_update_iommu_cap(domain);
	}

	return 0;
}
1768
/*
 * Drop one per-IOMMU reference on @domain; when the last reference for
 * this IOMMU goes away, release its domain id.  Returns the remaining
 * total attach count across all IOMMUs.  Caller holds both
 * device_domain_lock and iommu->lock.
 */
static int domain_detach_iommu(struct dmar_domain *domain,
			       struct intel_iommu *iommu)
{
	int num, count;

	assert_spin_locked(&device_domain_lock);
	assert_spin_locked(&iommu->lock);

	domain->iommu_refcnt[iommu->seq_id] -= 1;
	count = --domain->iommu_count;
	if (domain->iommu_refcnt[iommu->seq_id] == 0) {
		/* Last device on this IOMMU gone: free the domain id */
		num = domain->iommu_did[iommu->seq_id];
		clear_bit(num, iommu->domain_ids);
		set_iommu_domain(iommu, num, NULL);

		domain_update_iommu_cap(domain);
		domain->iommu_did[iommu->seq_id] = 0;
	}

	return count;
}
1790
/* IOVA ranges (IOAPIC window, PCI MMIO) that must never be used for DMA */
static struct iova_domain reserved_iova_list;
/* Separate lockdep class: the reserved list's rbtree lock nests differently */
static struct lock_class_key reserved_rbtree_key;
1793
/*
 * Populate reserved_iova_list with address ranges that DMA mappings must
 * avoid: the IOAPIC MMIO window and every PCI MMIO resource (preventing
 * mappings that would alias peer device BARs).  These reservations are
 * later copied into each new domain's IOVA allocator.  Returns 0 or
 * -ENODEV if a reservation fails.
 */
static int dmar_init_reserved_ranges(void)
{
	struct pci_dev *pdev = NULL;
	struct iova *iova;
	int i;

	init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);

	lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
		&reserved_rbtree_key);

	/* IOAPIC ranges shouldn't be accessed by DMA */
	iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
		IOVA_PFN(IOAPIC_RANGE_END));
	if (!iova) {
		pr_err("Reserve IOAPIC range failed\n");
		return -ENODEV;
	}

	/* Reserve all PCI MMIO to avoid peer-to-peer access */
	for_each_pci_dev(pdev) {
		struct resource *r;

		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
			r = &pdev->resource[i];
			if (!r->flags || !(r->flags & IORESOURCE_MEM))
				continue;
			iova = reserve_iova(&reserved_iova_list,
					    IOVA_PFN(r->start),
					    IOVA_PFN(r->end));
			if (!iova) {
				pci_err(pdev, "Reserve iova for %pR failed\n", r);
				return -ENODEV;
			}
		}
	}
	return 0;
}
1832
/* Copy the global reserved ranges into this domain's IOVA allocator */
static void domain_reserve_special_ranges(struct dmar_domain *domain)
{
	copy_reserved_iova(&reserved_iova_list, &domain->iovad);
}
1837
/*
 * Round a guest address width up to the nearest width the page-table
 * hierarchy can express: (width - 12) must be a multiple of 9 (4KiB
 * pages, 9 address bits per table level).  The result is capped at 64.
 */
static inline int guestwidth_to_adjustwidth(int gaw)
{
	int rem = (gaw - 12) % 9;
	int agaw = (rem == 0) ? gaw : gaw + 9 - rem;

	return (agaw > 64) ? 64 : agaw;
}
1851
/*
 * Initialize a freshly allocated domain against @iommu: set up its IOVA
 * allocator (with deferred-flush queue), clamp the requested guest
 * address width to what the hardware supports, pick a supported AGAW,
 * record capability bits and allocate the top-level page directory.
 * Returns 0 on success or a negative errno.
 */
static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
		       int guest_width)
{
	int adjust_width, agaw;
	unsigned long sagaw;
	int err;

	init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);

	err = init_iova_flush_queue(&domain->iovad,
				    iommu_flush_iova, iova_entry_free);
	if (err)
		return err;

	domain_reserve_special_ranges(domain);

	/* calculate AGAW */
	if (guest_width > cap_mgaw(iommu->cap))
		guest_width = cap_mgaw(iommu->cap);
	domain->gaw = guest_width;
	adjust_width = guestwidth_to_adjustwidth(guest_width);
	agaw = width_to_agaw(adjust_width);
	sagaw = cap_sagaw(iommu->cap);
	if (!test_bit(agaw, &sagaw)) {
		/* hardware doesn't support it, choose a bigger one */
		pr_debug("Hardware doesn't support agaw %d\n", agaw);
		agaw = find_next_bit(&sagaw, 5, agaw);
		if (agaw >= 5)
			return -ENODEV;
	}
	domain->agaw = agaw;

	if (ecap_coherent(iommu->ecap))
		domain->iommu_coherency = 1;
	else
		domain->iommu_coherency = 0;

	if (ecap_sc_support(iommu->ecap))
		domain->iommu_snooping = 1;
	else
		domain->iommu_snooping = 0;

	if (intel_iommu_superpage)
		domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
	else
		domain->iommu_superpage = 0;

	domain->nid = iommu->node;

	/* always allocate the top pgd */
	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
	if (!domain->pgd)
		return -ENOMEM;
	/* Page tables may be read by non-coherent hardware: flush the pgd */
	__iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
	return 0;
}
1908
/*
 * Tear down a domain: detach all devices, destroy the IOVA allocator,
 * unmap and free the whole page-table hierarchy, then free the domain
 * itself.
 */
static void domain_exit(struct dmar_domain *domain)
{
	struct page *freelist;

	/* Remove associated devices and clear attached or cached domains */
	rcu_read_lock();
	domain_remove_dev_info(domain);
	rcu_read_unlock();

	/* destroy iovas */
	put_iova_domain(&domain->iovad);

	/* Unmap the entire address space; pages come back on a free list */
	freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));

	dma_free_pagelist(freelist);

	free_domain_mem(domain);
}
1927
1928 /*
1929  * Get the PASID directory size for scalable mode context entry.
1930  * Value of X in the PDTS field of a scalable mode context entry
1931  * indicates PASID directory with 2^(X + 7) entries.
1932  */
1933 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1934 {
1935         int pds, max_pde;
1936
1937         max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1938         pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
1939         if (pds < 7)
1940                 return 0;
1941
1942         return pds - 7;
1943 }
1944
/*
 * Set the RID_PASID field of a scalable mode context entry. The
 * IOMMU hardware will use the PASID value set in this field for
 * DMA translations of DMA requests without PASID.
 */
static inline void
context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
{
	/* RID_PASID occupies the low 20 bits of the high qword */
	context->hi |= pasid & ((1 << 20) - 1);
	/*
	 * NOTE(review): this also sets the bit just above RID_PASID —
	 * presumably the RID_PRIV flag; confirm against the VT-d spec's
	 * scalable-mode context entry layout.
	 */
	context->hi |= (1 << 20);
}
1956
/*
 * Set the DTE(Device-TLB Enable) field of a scalable mode context
 * entry.
 */
static inline void context_set_sm_dte(struct context_entry *context)
{
	context->lo |= (1 << 2);
}
1965
/*
 * Set the PRE(Page Request Enable) field of a scalable mode context
 * entry.
 */
static inline void context_set_sm_pre(struct context_entry *context)
{
	context->lo |= (1 << 4);
}
1974
/* Convert value to context PASID directory size field coding: the PDTS
 * field is the 3-bit quantity at bits 11:9 of the low context qword. */
#define context_pdts(pds)	(((pds) & 0x7) << 9)
1977
/*
 * Install the context entry for one (bus, devfn) so the device's DMA
 * requests translate through @domain's page tables (or pass through for
 * the identity domain).  In scalable mode the entry points at the PASID
 * directory in @table instead.  Flushes caches/write buffer as the
 * hardware's caching mode requires.  Returns 0 on success, 0 if the
 * entry was already present, or a negative errno.
 */
static int domain_context_mapping_one(struct dmar_domain *domain,
				      struct intel_iommu *iommu,
				      struct pasid_table *table,
				      u8 bus, u8 devfn)
{
	u16 did = domain->iommu_did[iommu->seq_id];
	int translation = CONTEXT_TT_MULTI_LEVEL;
	struct device_domain_info *info = NULL;
	struct context_entry *context;
	unsigned long flags;
	int ret;

	WARN_ON(did == 0);

	if (hw_pass_through && domain_type_is_si(domain))
		translation = CONTEXT_TT_PASS_THROUGH;

	pr_debug("Set context mapping for %02x:%02x.%d\n",
		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));

	BUG_ON(!domain->pgd);

	/* Lock order: device_domain_lock outside, iommu->lock inside */
	spin_lock_irqsave(&device_domain_lock, flags);
	spin_lock(&iommu->lock);

	ret = -ENOMEM;
	context = iommu_context_addr(iommu, bus, devfn, 1);
	if (!context)
		goto out_unlock;

	/* Already mapped (possibly by another alias) — nothing to do */
	ret = 0;
	if (context_present(context))
		goto out_unlock;

	/*
	 * For kdump cases, old valid entries may be cached due to the
	 * in-flight DMA and copied pgtable, but there is no unmapping
	 * behaviour for them, thus we need an explicit cache flush for
	 * the newly-mapped device. For kdump, at this point, the device
	 * is supposed to finish reset at its driver probe stage, so no
	 * in-flight DMA will exist, and we don't need to worry anymore
	 * hereafter.
	 */
	if (context_copied(context)) {
		u16 did_old = context_domain_id(context);

		if (did_old < cap_ndoms(iommu->cap)) {
			iommu->flush.flush_context(iommu, did_old,
						   (((u16)bus) << 8) | devfn,
						   DMA_CCMD_MASK_NOBIT,
						   DMA_CCMD_DEVICE_INVL);
			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
						 DMA_TLB_DSI_FLUSH);
		}
	}

	context_clear_entry(context);

	if (sm_supported(iommu)) {
		unsigned long pds;

		WARN_ON(!table);

		/* Setup the PASID DIR pointer: */
		pds = context_get_sm_pds(table);
		context->lo = (u64)virt_to_phys(table->table) |
				context_pdts(pds);

		/* Setup the RID_PASID field: */
		context_set_sm_rid2pasid(context, PASID_RID2PASID);

		/*
		 * Setup the Device-TLB enable bit and Page request
		 * Enable bit:
		 */
		info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
		if (info && info->ats_supported)
			context_set_sm_dte(context);
		if (info && info->pri_supported)
			context_set_sm_pre(context);
	} else {
		struct dma_pte *pgd = domain->pgd;
		int agaw;

		context_set_domain_id(context, did);

		if (translation != CONTEXT_TT_PASS_THROUGH) {
			/*
			 * Skip top levels of page tables for iommu which has
			 * less agaw than default. Unnecessary for PT mode.
			 */
			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
				ret = -ENOMEM;
				pgd = phys_to_virt(dma_pte_addr(pgd));
				if (!dma_pte_present(pgd))
					goto out_unlock;
			}

			info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
			if (info && info->ats_supported)
				translation = CONTEXT_TT_DEV_IOTLB;
			else
				translation = CONTEXT_TT_MULTI_LEVEL;

			context_set_address_root(context, virt_to_phys(pgd));
			context_set_address_width(context, agaw);
		} else {
			/*
			 * In pass through mode, AW must be programmed to
			 * indicate the largest AGAW value supported by
			 * hardware. And ASR is ignored by hardware.
			 */
			context_set_address_width(context, iommu->msagaw);
		}

		context_set_translation_type(context, translation);
	}

	context_set_fault_enable(context);
	context_set_present(context);
	domain_flush_cache(domain, context, sizeof(*context));

	/*
	 * It's a non-present to present mapping. If hardware doesn't cache
	 * non-present entry we only need to flush the write-buffer. If it
	 * _does_ cache non-present entries, then it does so in the special
	 * domain #0, which we have to flush:
	 */
	if (cap_caching_mode(iommu->cap)) {
		iommu->flush.flush_context(iommu, 0,
					   (((u16)bus) << 8) | devfn,
					   DMA_CCMD_MASK_NOBIT,
					   DMA_CCMD_DEVICE_INVL);
		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
	} else {
		iommu_flush_write_buffer(iommu);
	}
	iommu_enable_dev_iotlb(info);

	ret = 0;

out_unlock:
	spin_unlock(&iommu->lock);
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return ret;
}
2125
/* Bundled arguments passed through pci_for_each_dma_alias()'s opaque ptr */
struct domain_context_mapping_data {
	struct dmar_domain *domain;
	struct intel_iommu *iommu;
	struct pasid_table *table;
};
2131
2132 static int domain_context_mapping_cb(struct pci_dev *pdev,
2133                                      u16 alias, void *opaque)
2134 {
2135         struct domain_context_mapping_data *data = opaque;
2136
2137         return domain_context_mapping_one(data->domain, data->iommu,
2138                                           data->table, PCI_BUS_NUM(alias),
2139                                           alias & 0xff);
2140 }
2141
2142 static int
2143 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2144 {
2145         struct domain_context_mapping_data data;
2146         struct pasid_table *table;
2147         struct intel_iommu *iommu;
2148         u8 bus, devfn;
2149
2150         iommu = device_to_iommu(dev, &bus, &devfn);
2151         if (!iommu)
2152                 return -ENODEV;
2153
2154         table = intel_pasid_get_table(dev);
2155
2156         if (!dev_is_pci(dev))
2157                 return domain_context_mapping_one(domain, iommu, table,
2158                                                   bus, devfn);
2159
2160         data.domain = domain;
2161         data.iommu = iommu;
2162         data.table = table;
2163
2164         return pci_for_each_dma_alias(to_pci_dev(dev),
2165                                       &domain_context_mapping_cb, &data);
2166 }
2167
2168 static int domain_context_mapped_cb(struct pci_dev *pdev,
2169                                     u16 alias, void *opaque)
2170 {
2171         struct intel_iommu *iommu = opaque;
2172
2173         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2174 }
2175
2176 static int domain_context_mapped(struct device *dev)
2177 {
2178         struct intel_iommu *iommu;
2179         u8 bus, devfn;
2180
2181         iommu = device_to_iommu(dev, &bus, &devfn);
2182         if (!iommu)
2183                 return -ENODEV;
2184
2185         if (!dev_is_pci(dev))
2186                 return device_context_mapped(iommu, bus, devfn);
2187
2188         return !pci_for_each_dma_alias(to_pci_dev(dev),
2189                                        domain_context_mapped_cb, iommu);
2190 }
2191
2192 /* Returns a number of VTD pages, but aligned to MM page size */
2193 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2194                                             size_t size)
2195 {
2196         host_addr &= ~PAGE_MASK;
2197         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2198 }
2199
2200 /* Return largest possible superpage level for a given mapping */
2201 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2202                                           unsigned long iov_pfn,
2203                                           unsigned long phy_pfn,
2204                                           unsigned long pages)
2205 {
2206         int support, level = 1;
2207         unsigned long pfnmerge;
2208
2209         support = domain->iommu_superpage;
2210
2211         /* To use a large page, the virtual *and* physical addresses
2212            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2213            of them will mean we have to use smaller pages. So just
2214            merge them and check both at once. */
2215         pfnmerge = iov_pfn | phy_pfn;
2216
2217         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2218                 pages >>= VTD_STRIDE_SHIFT;
2219                 if (!pages)
2220                         break;
2221                 pfnmerge >>= VTD_STRIDE_SHIFT;
2222                 level++;
2223                 support--;
2224         }
2225         return level;
2226 }
2227
/*
 * Install PTEs for nr_pages VT-d pages starting at iov_pfn, taking the
 * physical pages either from a scatterlist (@sg != NULL; also fills in
 * each sg entry's dma_address/dma_length) or from the contiguous range
 * beginning at @phys_pfn.  Uses superpages where alignment and hardware
 * support allow.  Caller is responsible for IOTLB invalidation.
 * Returns 0, -EINVAL for a permission-less @prot, or -ENOMEM.
 */
static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
			    struct scatterlist *sg, unsigned long phys_pfn,
			    unsigned long nr_pages, int prot)
{
	struct dma_pte *first_pte = NULL, *pte = NULL;
	phys_addr_t uninitialized_var(pteval);
	unsigned long sg_res = 0;
	unsigned int largepage_lvl = 0;
	unsigned long lvl_pages = 0;

	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));

	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
		return -EINVAL;

	prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;

	/* Contiguous mapping: treat the whole range as one "sg segment" */
	if (!sg) {
		sg_res = nr_pages;
		pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
	}

	while (nr_pages > 0) {
		uint64_t tmp;

		/* Current sg segment exhausted: advance to the next one */
		if (!sg_res) {
			unsigned int pgoff = sg->offset & ~PAGE_MASK;

			sg_res = aligned_nrpages(sg->offset, sg->length);
			sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
			sg->dma_length = sg->length;
			pteval = (sg_phys(sg) - pgoff) | prot;
			phys_pfn = pteval >> VTD_PAGE_SHIFT;
		}

		if (!pte) {
			largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);

			first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
			if (!pte)
				return -ENOMEM;
			/* It is large page*/
			if (largepage_lvl > 1) {
				unsigned long nr_superpages, end_pfn;

				pteval |= DMA_PTE_LARGE_PAGE;
				lvl_pages = lvl_to_nr_pages(largepage_lvl);

				nr_superpages = sg_res / lvl_pages;
				end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;

				/*
				 * Ensure that old small page tables are
				 * removed to make room for superpage(s).
				 * We're adding new large pages, so make sure
				 * we don't remove their parent tables.
				 */
				dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
						       largepage_lvl + 1);
			} else {
				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
			}

		}
		/* We don't need lock here, nobody else
		 * touches the iova range
		 */
		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
		if (tmp) {
			/* PTE was already set: overlapping mapping bug */
			static int dumps = 5;
			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
				iov_pfn, tmp, (unsigned long long)pteval);
			if (dumps) {
				dumps--;
				debug_dma_dump_mappings(NULL);
			}
			WARN_ON(1);
		}

		lvl_pages = lvl_to_nr_pages(largepage_lvl);

		BUG_ON(nr_pages < lvl_pages);
		BUG_ON(sg_res < lvl_pages);

		nr_pages -= lvl_pages;
		iov_pfn += lvl_pages;
		phys_pfn += lvl_pages;
		pteval += lvl_pages * VTD_PAGE_SIZE;
		sg_res -= lvl_pages;

		/* If the next PTE would be the first in a new page, then we
		   need to flush the cache on the entries we've just written.
		   And then we'll need to recalculate 'pte', so clear it and
		   let it get set again in the if (!pte) block above.

		   If we're done (!nr_pages) we need to flush the cache too.

		   Also if we've been setting superpages, we may need to
		   recalculate 'pte' and switch back to smaller pages for the
		   end of the mapping, if the trailing size is not enough to
		   use another superpage (i.e. sg_res < lvl_pages). */
		pte++;
		if (!nr_pages || first_pte_in_page(pte) ||
		    (largepage_lvl > 1 && sg_res < lvl_pages)) {
			domain_flush_cache(domain, first_pte,
					   (void *)pte - (void *)first_pte);
			pte = NULL;
		}

		if (!sg_res && nr_pages)
			sg = sg_next(sg);
	}
	return 0;
}
2342
/*
 * Wrapper around __domain_mapping() that, on success, notifies every
 * IOMMU backing the domain about the new mapping (needed e.g. for
 * caching-mode hardware).  Same parameters and return values as
 * __domain_mapping().
 */
static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
			 struct scatterlist *sg, unsigned long phys_pfn,
			 unsigned long nr_pages, int prot)
{
	int ret;
	struct intel_iommu *iommu;

	/* Do the real mapping first */
	ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
	if (ret)
		return ret;

	/* Notify about the new mapping */
	if (domain_type_is_vm(domain)) {
		/* VM typed domains can have more than one IOMMUs */
		int iommu_id;
		for_each_domain_iommu(iommu_id, domain) {
			iommu = g_iommus[iommu_id];
			__mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
		}
	} else {
		/* General domains only have one IOMMU */
		iommu = domain_get_iommu(domain);
		__mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
	}

	return 0;
}
2371
/* Map a scatterlist into the domain starting at iov_pfn */
static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
				    struct scatterlist *sg, unsigned long nr_pages,
				    int prot)
{
	return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
}
2378
/* Map a contiguous physical range into the domain starting at iov_pfn */
static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
				     unsigned long phys_pfn, unsigned long nr_pages,
				     int prot)
{
	return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
}
2385
/*
 * Clear the context entry for (bus, devfn) and invalidate the context
 * cache and IOTLB for its old domain id.  The flushes are issued after
 * dropping iommu->lock.
 */
static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	unsigned long flags;
	struct context_entry *context;
	u16 did_old;

	if (!iommu)
		return;

	spin_lock_irqsave(&iommu->lock, flags);
	/* Lookup only (alloc=0): no entry means nothing to clear */
	context = iommu_context_addr(iommu, bus, devfn, 0);
	if (!context) {
		spin_unlock_irqrestore(&iommu->lock, flags);
		return;
	}
	/* Remember the old domain id for the invalidations below */
	did_old = context_domain_id(context);
	context_clear_entry(context);
	__iommu_flush_cache(iommu, context, sizeof(*context));
	spin_unlock_irqrestore(&iommu->lock, flags);
	iommu->flush.flush_context(iommu,
				   did_old,
				   (((u16)bus) << 8) | devfn,
				   DMA_CCMD_MASK_NOBIT,
				   DMA_CCMD_DEVICE_INVL);
	iommu->flush.flush_iotlb(iommu,
				 did_old,
				 0,
				 0,
				 DMA_TLB_DSI_FLUSH);
}
2416
/*
 * Remove a device_domain_info from both its domain's device list and
 * the global list, and detach it from the struct device.  Caller holds
 * device_domain_lock.
 */
static inline void unlink_domain_info(struct device_domain_info *info)
{
	assert_spin_locked(&device_domain_lock);
	list_del(&info->link);
	list_del(&info->global);
	if (info->dev)
		info->dev->archdata.iommu = NULL;
}
2425
/* Detach and tear down every device currently attached to @domain */
static void domain_remove_dev_info(struct dmar_domain *domain)
{
	struct device_domain_info *info, *tmp;
	unsigned long flags;

	spin_lock_irqsave(&device_domain_lock, flags);
	/* _safe variant: __dmar_remove_one_dev_info unlinks each entry */
	list_for_each_entry_safe(info, tmp, &domain->devices, link)
		__dmar_remove_one_dev_info(info);
	spin_unlock_irqrestore(&device_domain_lock, flags);
}
2436
/*
 * find_domain
 * Return the domain @dev is currently attached to, or NULL.
 * Note: struct device->archdata.iommu is used to store the
 * device_domain_info.
 */
static struct dmar_domain *find_domain(struct device *dev)
{
	struct device_domain_info *info;

	/* No lock here, assumes no domain exit in normal case */
	info = dev->archdata.iommu;
	if (likely(info))
		return info->domain;
	return NULL;
}
2451
/*
 * Look up a device_domain_info by PCI segment/bus/devfn on the global
 * list.  Caller holds device_domain_lock.  Returns NULL if not found.
 */
static inline struct device_domain_info *
dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
{
	struct device_domain_info *info;

	list_for_each_entry(info, &device_domain_list, global)
		if (info->iommu->segment == segment && info->bus == bus &&
		    info->devfn == devfn)
			return info;

	return NULL;
}
2464
/*
 * Bind @dev (or the bare bus/devfn when @dev is NULL) to @domain:
 * allocate its device_domain_info, probe ATS/PASID/PRI support, attach
 * the domain to @iommu, set up the PASID table and RID2PASID entry in
 * scalable mode, and install the context mapping.  If another thread
 * already bound the device, returns that existing domain (caller must
 * then free the one it passed in).  Returns NULL on failure.
 */
static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
						    int bus, int devfn,
						    struct device *dev,
						    struct dmar_domain *domain)
{
	struct dmar_domain *found = NULL;
	struct device_domain_info *info;
	unsigned long flags;
	int ret;

	info = alloc_devinfo_mem();
	if (!info)
		return NULL;

	info->bus = bus;
	info->devfn = devfn;
	info->ats_supported = info->pasid_supported = info->pri_supported = 0;
	info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
	info->ats_qdep = 0;
	info->dev = dev;
	info->domain = domain;
	info->iommu = iommu;
	info->pasid_table = NULL;

	if (dev && dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(info->dev);

		/* ATS only for trusted devices with a matching ATSR unit */
		if (!pdev->untrusted &&
		    !pci_ats_disabled() &&
		    ecap_dev_iotlb_support(iommu->ecap) &&
		    pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
		    dmar_find_matched_atsr_unit(pdev))
			info->ats_supported = 1;

		if (sm_supported(iommu)) {
			if (pasid_supported(iommu)) {
				int features = pci_pasid_features(pdev);
				if (features >= 0)
					info->pasid_supported = features | 1;
			}

			/* PRI requires ATS */
			if (info->ats_supported && ecap_prs(iommu->ecap) &&
			    pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
				info->pri_supported = 1;
		}
	}

	spin_lock_irqsave(&device_domain_lock, flags);
	if (dev)
		found = find_domain(dev);

	if (!found) {
		/* The (bus, devfn) may be known without a struct device */
		struct device_domain_info *info2;
		info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
		if (info2) {
			found      = info2->domain;
			info2->dev = dev;
		}
	}

	if (found) {
		spin_unlock_irqrestore(&device_domain_lock, flags);
		free_devinfo_mem(info);
		/* Caller must free the original domain */
		return found;
	}

	spin_lock(&iommu->lock);
	ret = domain_attach_iommu(domain, iommu);
	spin_unlock(&iommu->lock);

	if (ret) {
		spin_unlock_irqrestore(&device_domain_lock, flags);
		free_devinfo_mem(info);
		return NULL;
	}

	list_add(&info->link, &domain->devices);
	list_add(&info->global, &device_domain_list);
	if (dev)
		dev->archdata.iommu = info;
	spin_unlock_irqrestore(&device_domain_lock, flags);

	/* PASID table is mandatory for a PCI device in scalable mode. */
	if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
		ret = intel_pasid_alloc_table(dev);
		if (ret) {
			dev_err(dev, "PASID table allocation failed\n");
			dmar_remove_one_dev_info(dev);
			return NULL;
		}

		/* Setup the PASID entry for requests without PASID: */
		spin_lock(&iommu->lock);
		if (hw_pass_through && domain_type_is_si(domain))
			ret = intel_pasid_setup_pass_through(iommu, domain,
					dev, PASID_RID2PASID);
		else
			ret = intel_pasid_setup_second_level(iommu, domain,
					dev, PASID_RID2PASID);
		spin_unlock(&iommu->lock);
		if (ret) {
			dev_err(dev, "Setup RID2PASID failed\n");
			dmar_remove_one_dev_info(dev);
			return NULL;
		}
	}

	if (dev && domain_context_mapping(domain, dev)) {
		dev_err(dev, "Domain context map failed\n");
		dmar_remove_one_dev_info(dev);
		return NULL;
	}

	return domain;
}
2581
2582 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2583 {
2584         *(u16 *)opaque = alias;
2585         return 0;
2586 }
2587
/*
 * Find the domain already associated with @dev's topmost PCI DMA alias,
 * or allocate and initialize a fresh one with address width @gaw.
 *
 * Returns the domain on success, NULL when no IOMMU covers the device or
 * allocation/initialization fails.  The returned domain is not yet bound
 * to the device; callers pair this with set_domain_for_dev().
 */
static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
{
	struct device_domain_info *info;
	struct dmar_domain *domain = NULL;
	struct intel_iommu *iommu;
	u16 dma_alias;
	unsigned long flags;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return NULL;

	if (dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(dev);

		/* Leaves the topmost alias in dma_alias (see get_last_alias()) */
		pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);

		spin_lock_irqsave(&device_domain_lock, flags);
		info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
						      PCI_BUS_NUM(dma_alias),
						      dma_alias & 0xff);
		if (info) {
			iommu = info->iommu;
			domain = info->domain;
		}
		spin_unlock_irqrestore(&device_domain_lock, flags);

		/* DMA alias already has a domain, use it */
		if (info)
			goto out;
	}

	/* Allocate and initialize new domain for the device */
	domain = alloc_domain(0);
	if (!domain)
		return NULL;
	if (domain_init(domain, iommu, gaw)) {
		domain_exit(domain);
		return NULL;
	}

out:

	return domain;
}
2634
/*
 * Bind @domain to @dev (and, for PCI devices, to the device's topmost
 * DMA alias when that differs from the device's own requester ID).
 *
 * Returns @domain on success.  If the device or its alias was already
 * attached to a different domain, that existing domain is returned
 * instead; returns NULL when no IOMMU covers the device or the insert
 * fails.  Callers must handle the "different domain came back" case.
 */
static struct dmar_domain *set_domain_for_dev(struct device *dev,
					      struct dmar_domain *domain)
{
	struct intel_iommu *iommu;
	struct dmar_domain *tmp;
	u16 req_id, dma_alias;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return NULL;

	/* Requester ID of the device itself */
	req_id = ((u16)bus << 8) | devfn;

	if (dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(dev);

		pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);

		/* register PCI DMA alias device */
		if (req_id != dma_alias) {
			tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
					dma_alias & 0xff, NULL, domain);

			if (!tmp || tmp != domain)
				return tmp;
		}
	}

	tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
	if (!tmp || tmp != domain)
		return tmp;

	return domain;
}
2670
/*
 * Return the domain @dev is attached to; if there is none, allocate one
 * (possibly shared with the device's DMA alias) and attach it.
 *
 * Returns NULL on allocation/attach failure.  If somebody attached a
 * different domain first, the freshly allocated one is torn down and
 * the winner is returned.
 */
static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
{
	struct dmar_domain *domain, *tmp;

	domain = find_domain(dev);
	if (domain)
		goto out;

	domain = find_or_alloc_domain(dev, gaw);
	if (!domain)
		goto out;

	tmp = set_domain_for_dev(dev, domain);
	if (!tmp || domain != tmp) {
		/* A different domain won the race - discard ours, use theirs */
		domain_exit(domain);
		domain = tmp;
	}

out:

	return domain;
}
2693
/*
 * Install a 1:1 (IOVA == physical address) read/write mapping for
 * [start, end] in @domain, after reserving the range in the domain's
 * IOVA allocator so the DMA API never hands it out.
 */
static int iommu_domain_identity_map(struct dmar_domain *domain,
				     unsigned long long start,
				     unsigned long long end)
{
	unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
	unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;

	if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
			  dma_to_mm_pfn(last_vpfn))) {
		pr_err("Reserving iova failed\n");
		return -ENOMEM;
	}

	pr_debug("Mapping reserved region %llx-%llx\n", start, end);
	/*
	 * RMRR range might have overlap with physical memory range,
	 * clear it first
	 */
	dma_pte_clear_range(domain, first_vpfn, last_vpfn);

	return __domain_mapping(domain, first_vpfn, NULL,
				first_vpfn, last_vpfn - first_vpfn + 1,
				DMA_PTE_READ|DMA_PTE_WRITE);
}
2718
2719 static int domain_prepare_identity_map(struct device *dev,
2720                                        struct dmar_domain *domain,
2721                                        unsigned long long start,
2722                                        unsigned long long end)
2723 {
2724         /* For _hardware_ passthrough, don't bother. But for software
2725            passthrough, we do it anyway -- it may indicate a memory
2726            range which is reserved in E820, so which didn't get set
2727            up to start with in si_domain */
2728         if (domain == si_domain && hw_pass_through) {
2729                 dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n",
2730                          start, end);
2731                 return 0;
2732         }
2733
2734         dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end);
2735
2736         if (end < start) {
2737                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2738                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2739                         dmi_get_system_info(DMI_BIOS_VENDOR),
2740                         dmi_get_system_info(DMI_BIOS_VERSION),
2741                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2742                 return -EIO;
2743         }
2744
2745         if (end >> agaw_to_width(domain->agaw)) {
2746                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2747                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2748                      agaw_to_width(domain->agaw),
2749                      dmi_get_system_info(DMI_BIOS_VENDOR),
2750                      dmi_get_system_info(DMI_BIOS_VERSION),
2751                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2752                 return -EIO;
2753         }
2754
2755         return iommu_domain_identity_map(domain, start, end);
2756 }
2757
2758 static int iommu_prepare_identity_map(struct device *dev,
2759                                       unsigned long long start,
2760                                       unsigned long long end)
2761 {
2762         struct dmar_domain *domain;
2763         int ret;
2764
2765         domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2766         if (!domain)
2767                 return -ENOMEM;
2768
2769         ret = domain_prepare_identity_map(dev, domain, start, end);
2770         if (ret)
2771                 domain_exit(domain);
2772
2773         return ret;
2774 }
2775
2776 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2777                                          struct device *dev)
2778 {
2779         if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2780                 return 0;
2781         return iommu_prepare_identity_map(dev, rmrr->base_address,
2782                                           rmrr->end_address);
2783 }
2784
#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
/*
 * Identity-map the low 16MiB for the ISA/LPC bridge so that legacy
 * (e.g. floppy) DMA, which cannot be remapped, keeps working.
 */
static inline void iommu_prepare_isa(void)
{
	struct pci_dev *pdev;
	int ret;

	pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
	if (!pdev)
		return;

	pr_info("Prepare 0-16MiB unity mapping for LPC\n");
	ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);

	if (ret)
		pr_err("Failed to create 0-16MiB identity map - floppy might not work\n");

	pci_dev_put(pdev);
}
#else
static inline void iommu_prepare_isa(void)
{
}
#endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2809
static int md_domain_init(struct dmar_domain *domain, int guest_width);

/*
 * Allocate and initialize the static identity (si) domain.  With @hw
 * (hardware passthrough) the page tables are never consulted, so no
 * mappings are created; otherwise identity-map every online node's
 * physical memory ranges.
 */
static int __init si_domain_init(int hw)
{
	int nid, ret;

	si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
	if (!si_domain)
		return -EFAULT;

	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
		domain_exit(si_domain);
		return -EFAULT;
	}

	pr_debug("Identity mapping domain allocated\n");

	/* Hardware passthrough needs no page table entries */
	if (hw)
		return 0;

	for_each_online_node(nid) {
		unsigned long start_pfn, end_pfn;
		int i;

		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
			ret = iommu_domain_identity_map(si_domain,
					PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
			if (ret)
				return ret;
		}
	}

	return 0;
}
2844
2845 static int identity_mapping(struct device *dev)
2846 {
2847         struct device_domain_info *info;
2848
2849         if (likely(!iommu_identity_mapping))
2850                 return 0;
2851
2852         info = dev->archdata.iommu;
2853         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2854                 return (info->domain == si_domain);
2855
2856         return 0;
2857 }
2858
2859 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2860 {
2861         struct dmar_domain *ndomain;
2862         struct intel_iommu *iommu;
2863         u8 bus, devfn;
2864
2865         iommu = device_to_iommu(dev, &bus, &devfn);
2866         if (!iommu)
2867                 return -ENODEV;
2868
2869         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2870         if (ndomain != domain)
2871                 return -EBUSY;
2872
2873         return 0;
2874 }
2875
/*
 * Return true if @dev appears in the device scope of any RMRR unit.
 * Walks the RCU-protected RMRR list, so may be called without holding
 * other locks.
 */
static bool device_has_rmrr(struct device *dev)
{
	struct dmar_rmrr_unit *rmrr;
	struct device *tmp;
	int i;

	rcu_read_lock();
	for_each_rmrr_units(rmrr) {
		/*
		 * Return TRUE if this RMRR contains the device that
		 * is passed in.
		 */
		for_each_active_dev_scope(rmrr->devices,
					  rmrr->devices_cnt, i, tmp)
			if (tmp == dev) {
				rcu_read_unlock();
				return true;
			}
	}
	rcu_read_unlock();
	return false;
}
2898
2899 /*
2900  * There are a couple cases where we need to restrict the functionality of
2901  * devices associated with RMRRs.  The first is when evaluating a device for
2902  * identity mapping because problems exist when devices are moved in and out
2903  * of domains and their respective RMRR information is lost.  This means that
2904  * a device with associated RMRRs will never be in a "passthrough" domain.
2905  * The second is use of the device through the IOMMU API.  This interface
2906  * expects to have full control of the IOVA space for the device.  We cannot
2907  * satisfy both the requirement that RMRR access is maintained and have an
2908  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2909  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2910  * We therefore prevent devices associated with an RMRR from participating in
2911  * the IOMMU API, which eliminates them from device assignment.
2912  *
2913  * In both cases we assume that PCI USB devices with RMRRs have them largely
2914  * for historical reasons and that the RMRR space is not actively used post
2915  * boot.  This exclusion may change if vendors begin to abuse it.
2916  *
2917  * The same exception is made for graphics devices, with the requirement that
2918  * any use of the RMRR regions will be torn down before assigning the device
2919  * to a guest.
2920  */
2921 static bool device_is_rmrr_locked(struct device *dev)
2922 {
2923         if (!device_has_rmrr(dev))
2924                 return false;
2925
2926         if (dev_is_pci(dev)) {
2927                 struct pci_dev *pdev = to_pci_dev(dev);
2928
2929                 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2930                         return false;
2931         }
2932
2933         return true;
2934 }
2935
/*
 * Decide whether @dev should live in the static identity (1:1) domain.
 * @startup distinguishes boot-time evaluation (be optimistic about
 * 64-bit capability) from later re-evaluation (check the device's DMA
 * masks).  Returns non-zero when the device should be identity-mapped.
 */
static int iommu_should_identity_map(struct device *dev, int startup)
{
	if (dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(dev);

		/* RMRR-encumbered devices must stay fully translated */
		if (device_is_rmrr_locked(dev))
			return 0;

		/*
		 * Prevent any device marked as untrusted from getting
		 * placed into the statically identity mapping domain.
		 */
		if (pdev->untrusted)
			return 0;

		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
			return 1;

		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
			return 1;

		if (!(iommu_identity_mapping & IDENTMAP_ALL))
			return 0;

		/*
		 * We want to start off with all devices in the 1:1 domain, and
		 * take them out later if we find they can't access all of memory.
		 *
		 * However, we can't do this for PCI devices behind bridges,
		 * because all PCI devices behind the same bridge will end up
		 * with the same source-id on their transactions.
		 *
		 * Practically speaking, we can't change things around for these
		 * devices at run-time, because we can't be sure there'll be no
		 * DMA transactions in flight for any of their siblings.
		 *
		 * So PCI devices (unless they're on the root bus) as well as
		 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
		 * the 1:1 domain, just in _case_ one of their siblings turns out
		 * not to be able to map all of memory.
		 */
		if (!pci_is_pcie(pdev)) {
			if (!pci_is_root_bus(pdev->bus))
				return 0;
			if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
				return 0;
		} else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
			return 0;
	} else {
		if (device_has_rmrr(dev))
			return 0;
	}

	/*
	 * At boot time, we don't yet know if devices will be 64-bit capable.
	 * Assume that they will — if they turn out not to be, then we can
	 * take them out of the 1:1 domain later.
	 */
	if (!startup) {
		/*
		 * If the device's dma_mask is less than the system's memory
		 * size then this is not a candidate for identity mapping.
		 */
		u64 dma_mask = *dev->dma_mask;

		if (dev->coherent_dma_mask &&
		    dev->coherent_dma_mask < dma_mask)
			dma_mask = dev->coherent_dma_mask;

		return dma_mask >= dma_get_required_mask(dev);
	}

	return 1;
}
3010
3011 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
3012 {
3013         int ret;
3014
3015         if (!iommu_should_identity_map(dev, 1))
3016                 return 0;
3017
3018         ret = domain_add_dev_info(si_domain, dev);
3019         if (!ret)
3020                 dev_info(dev, "%s identity mapping\n",
3021                          hw ? "Hardware" : "Software");
3022         else if (ret == -ENODEV)
3023                 /* device not associated with an iommu */
3024                 ret = 0;
3025
3026         return ret;
3027 }
3028
3029
/*
 * Walk all PCI devices, plus the ACPI namespace devices in each DRHD's
 * device scope (via their physical-node companions), and give every
 * device that qualifies a static identity mapping.  @hw is non-zero
 * when hardware passthrough will be used (affects logging only here).
 */
static int __init iommu_prepare_static_identity_mapping(int hw)
{
	struct pci_dev *pdev = NULL;
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	struct device *dev;
	int i;
	int ret = 0;

	for_each_pci_dev(pdev) {
		ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
		if (ret)
			return ret;
	}

	for_each_active_iommu(iommu, drhd)
		for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
			struct acpi_device_physical_node *pn;
			struct acpi_device *adev;

			/* Only ACPI namespace devices are handled below */
			if (dev->bus != &acpi_bus_type)
				continue;

			adev= to_acpi_device(dev);
			mutex_lock(&adev->physical_node_lock);
			list_for_each_entry(pn, &adev->physical_node_list, node) {
				ret = dev_prepare_static_identity_mapping(pn->dev, hw);
				if (ret)
					break;
			}
			mutex_unlock(&adev->physical_node_lock);
			if (ret)
				return ret;
		}

	return 0;
}
3067
3068 static void intel_iommu_init_qi(struct intel_iommu *iommu)
3069 {
3070         /*
3071          * Start from the sane iommu hardware state.
3072          * If the queued invalidation is already initialized by us
3073          * (for example, while enabling interrupt-remapping) then
3074          * we got the things already rolling from a sane state.
3075          */
3076         if (!iommu->qi) {
3077                 /*
3078                  * Clear any previous faults.
3079                  */
3080                 dmar_fault(-1, iommu);
3081                 /*
3082                  * Disable queued invalidation if supported and already enabled
3083                  * before OS handover.
3084                  */
3085                 dmar_disable_qi(iommu);
3086         }
3087
3088         if (dmar_enable_qi(iommu)) {
3089                 /*
3090                  * Queued Invalidate not enabled, use Register Based Invalidate
3091                  */
3092                 iommu->flush.flush_context = __iommu_flush_context;
3093                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3094                 pr_info("%s: Using Register based invalidation\n",
3095                         iommu->name);
3096         } else {
3097                 iommu->flush.flush_context = qi_flush_context;
3098                 iommu->flush.flush_iotlb = qi_flush_iotlb;
3099                 pr_info("%s: Using Queued invalidation\n", iommu->name);
3100         }
3101 }
3102
/*
 * Copy one bus's worth of context entries from the previous kernel's
 * tables (kexec/kdump handover).  @old_re is the old root entry for
 * @bus; @ext selects the extended root/context format, which uses two
 * context tables per bus (devfn < 0x80 via the lower pointer, the rest
 * via the upper one).  Newly allocated copies are stored in @tbl for
 * copy_translation_tables() to hook into the new root table.
 */
static int copy_context_table(struct intel_iommu *iommu,
                              struct root_entry *old_re,
                              struct context_entry **tbl,
                              int bus, bool ext)
{
	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
	struct context_entry *new_ce = NULL, ce;
	struct context_entry *old_ce = NULL;
	struct root_entry re;
	phys_addr_t old_ce_phys;

	tbl_idx = ext ? bus * 2 : bus;
	memcpy(&re, old_re, sizeof(re));

	for (devfn = 0; devfn < 256; devfn++) {
		/* First calculate the correct index */
		idx = (ext ? devfn * 2 : devfn) % 256;

		if (idx == 0) {
			/* First save what we may have and clean up */
			if (new_ce) {
				tbl[tbl_idx] = new_ce;
				__iommu_flush_cache(iommu, new_ce,
						    VTD_PAGE_SIZE);
				pos = 1;
			}

			if (old_ce)
				memunmap(old_ce);

			ret = 0;
			if (devfn < 0x80)
				old_ce_phys = root_entry_lctp(&re);
			else
				old_ce_phys = root_entry_uctp(&re);

			if (!old_ce_phys) {
				if (ext && devfn == 0) {
					/* No LCTP, try UCTP */
					devfn = 0x7f;
					continue;
				} else {
					goto out;
				}
			}

			ret = -ENOMEM;
			old_ce = memremap(old_ce_phys, PAGE_SIZE,
					MEMREMAP_WB);
			if (!old_ce)
				goto out;

			new_ce = alloc_pgtable_page(iommu->node);
			if (!new_ce)
				goto out_unmap;

			ret = 0;
		}

		/* Now copy the context entry */
		memcpy(&ce, old_ce + idx, sizeof(ce));

		if (!__context_present(&ce))
			continue;

		/* Keep the old kernel's domain IDs out of our allocator */
		did = context_domain_id(&ce);
		if (did >= 0 && did < cap_ndoms(iommu->cap))
			set_bit(did, iommu->domain_ids);

		/*
		 * We need a marker for copied context entries. This
		 * marker needs to work for the old format as well as
		 * for extended context entries.
		 *
		 * Bit 67 of the context entry is used. In the old
		 * format this bit is available to software, in the
		 * extended format it is the PGE bit, but PGE is ignored
		 * by HW if PASIDs are disabled (and thus still
		 * available).
		 *
		 * So disable PASIDs first and then mark the entry
		 * copied. This means that we don't copy PASID
		 * translations from the old kernel, but this is fine as
		 * faults there are not fatal.
		 */
		context_clear_pasid_enable(&ce);
		context_set_copied(&ce);

		new_ce[idx] = ce;
	}

	/* Store the final (possibly upper) table; pos is 1 if the lower
	   table was already stored above */
	tbl[tbl_idx + pos] = new_ce;

	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);

out_unmap:
	memunmap(old_ce);

out:
	return ret;
}
3204
/*
 * After a kexec/kdump handover with translation still enabled, copy the
 * previous kernel's root and context tables so in-flight DMA keeps
 * working while this kernel takes over.  Bails out with -EINVAL if the
 * old and new root table formats (RTT bit vs. ECS capability) disagree,
 * since changing the format would require disabling translation.
 */
static int copy_translation_tables(struct intel_iommu *iommu)
{
	struct context_entry **ctxt_tbls;
	struct root_entry *old_rt;
	phys_addr_t old_rt_phys;
	int ctxt_table_entries;
	unsigned long flags;
	u64 rtaddr_reg;
	int bus, ret;
	bool new_ext, ext;

	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
	ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
	new_ext    = !!ecap_ecs(iommu->ecap);

	/*
	 * The RTT bit can only be changed when translation is disabled,
	 * but disabling translation means to open a window for data
	 * corruption. So bail out and don't copy anything if we would
	 * have to change the bit.
	 */
	if (new_ext != ext)
		return -EINVAL;

	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
	if (!old_rt_phys)
		return -EINVAL;

	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
	if (!old_rt)
		return -ENOMEM;

	/* This is too big for the stack - allocate it from slab */
	ctxt_table_entries = ext ? 512 : 256;
	ret = -ENOMEM;
	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
	if (!ctxt_tbls)
		goto out_unmap;

	/* A failed bus leaves its ctxt_tbls slots NULL; keep going */
	for (bus = 0; bus < 256; bus++) {
		ret = copy_context_table(iommu, &old_rt[bus],
					 ctxt_tbls, bus, ext);
		if (ret) {
			pr_err("%s: Failed to copy context table for bus %d\n",
				iommu->name, bus);
			continue;
		}
	}

	spin_lock_irqsave(&iommu->lock, flags);

	/* Context tables are copied, now write them to the root_entry table */
	for (bus = 0; bus < 256; bus++) {
		int idx = ext ? bus * 2 : bus;
		u64 val;

		if (ctxt_tbls[idx]) {
			/* bit 0 = present */
			val = virt_to_phys(ctxt_tbls[idx]) | 1;
			iommu->root_entry[bus].lo = val;
		}

		if (!ext || !ctxt_tbls[idx + 1])
			continue;

		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
		iommu->root_entry[bus].hi = val;
	}

	spin_unlock_irqrestore(&iommu->lock, flags);

	kfree(ctxt_tbls);

	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);

	ret = 0;

out_unmap:
	memunmap(old_rt);

	return ret;
}
3286
3287 static int __init init_dmars(void)
3288 {
3289         struct dmar_drhd_unit *drhd;
3290         struct dmar_rmrr_unit *rmrr;
3291         bool copied_tables = false;
3292         struct device *dev;
3293         struct intel_iommu *iommu;
3294         int i, ret;
3295
3296         /*
3297          * for each drhd
3298          *    allocate root
3299          *    initialize and program root entry to not present
3300          * endfor
3301          */
3302         for_each_drhd_unit(drhd) {
3303                 /*
3304                  * lock not needed as this is only incremented in the single
3305                  * threaded kernel __init code path all other access are read
3306                  * only
3307                  */
3308                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3309                         g_num_of_iommus++;
3310                         continue;
3311                 }
3312                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3313         }
3314
3315         /* Preallocate enough resources for IOMMU hot-addition */
3316         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3317                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3318
3319         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3320                         GFP_KERNEL);
3321         if (!g_iommus) {
3322                 pr_err("Allocating global iommu array failed\n");
3323                 ret = -ENOMEM;
3324                 goto error;
3325         }
3326
3327         for_each_active_iommu(iommu, drhd) {
3328                 /*
3329                  * Find the max pasid size of all IOMMU's in the system.
3330                  * We need to ensure the system pasid table is no bigger
3331                  * than the smallest supported.
3332                  */
3333                 if (pasid_supported(iommu)) {
3334                         u32 temp = 2 << ecap_pss(iommu->ecap);
3335
3336                         intel_pasid_max_id = min_t(u32, temp,
3337                                                    intel_pasid_max_id);
3338                 }
3339
3340                 g_iommus[iommu->seq_id] = iommu;
3341
3342                 intel_iommu_init_qi(iommu);
3343
3344                 ret = iommu_init_domains(iommu);
3345                 if (ret)
3346                         goto free_iommu;
3347
3348                 init_translation_status(iommu);
3349
3350                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3351                         iommu_disable_translation(iommu);
3352                         clear_translation_pre_enabled(iommu);
3353                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3354                                 iommu->name);
3355                 }
3356
3357                 /*
3358                  * TBD:
3359                  * we could share the same root & context tables
3360                  * among all IOMMU's. Need to Split it later.
3361                  */
3362                 ret = iommu_alloc_root_entry(iommu);
3363                 if (ret)
3364                         goto free_iommu;
3365
3366                 if (translation_pre_enabled(iommu)) {
3367                         pr_info("Translation already enabled - trying to copy translation structures\n");
3368
3369                         ret = copy_translation_tables(iommu);
3370                         if (ret) {
3371                                 /*
3372                                  * We found the IOMMU with translation
3373                                  * enabled - but failed to copy over the
3374                                  * old root-entry table. Try to proceed
3375                                  * by disabling translation now and
3376                                  * allocating a clean root-entry table.
3377                                  * This might cause DMAR faults, but
3378                                  * probably the dump will still succeed.
3379                                  */
3380                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3381                                        iommu->name);
3382                                 iommu_disable_translation(iommu);
3383                                 clear_translation_pre_enabled(iommu);
3384                         } else {
3385                                 pr_info("Copied translation tables from previous kernel for %s\n",
3386                                         iommu->name);
3387                                 copied_tables = true;
3388                         }
3389                 }
3390
3391                 if (!ecap_pass_through(iommu->ecap))
3392                         hw_pass_through = 0;
3393 #ifdef CONFIG_INTEL_IOMMU_SVM
3394                 if (pasid_supported(iommu))
3395                         intel_svm_init(iommu);
3396 #endif
3397         }
3398
3399         /*
3400          * Now that qi is enabled on all iommus, set the root entry and flush
3401          * caches. This is required on some Intel X58 chipsets, otherwise the
3402          * flush_context function will loop forever and the boot hangs.
3403          */
3404         for_each_active_iommu(iommu, drhd) {
3405                 iommu_flush_write_buffer(iommu);
3406                 iommu_set_root_entry(iommu);
3407                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3408                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3409         }
3410
3411         if (iommu_pass_through)
3412                 iommu_identity_mapping |= IDENTMAP_ALL;
3413
3414 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3415         iommu_identity_mapping |= IDENTMAP_GFX;
3416 #endif
3417
3418         check_tylersburg_isoch();
3419
3420         if (iommu_identity_mapping) {
3421                 ret = si_domain_init(hw_pass_through);
3422                 if (ret)
3423                         goto free_iommu;
3424         }
3425
3426
3427         /*
3428          * If we copied translations from a previous kernel in the kdump
3429          * case, we can not assign the devices to domains now, as that
3430          * would eliminate the old mappings. So skip this part and defer
3431          * the assignment to device driver initialization time.
3432          */
3433         if (copied_tables)
3434                 goto domains_done;
3435
3436         /*
3437          * If pass through is not set or not enabled, setup context entries for
3438          * identity mappings for rmrr, gfx, and isa and may fall back to static
3439          * identity mapping if iommu_identity_mapping is set.
3440          */
3441         if (iommu_identity_mapping) {
3442                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
3443                 if (ret) {
3444                         pr_crit("Failed to setup IOMMU pass-through\n");
3445                         goto free_iommu;
3446                 }
3447         }
3448         /*
3449          * For each rmrr
3450          *   for each dev attached to rmrr
3451          *   do
3452          *     locate drhd for dev, alloc domain for dev
3453          *     allocate free domain
3454          *     allocate page table entries for rmrr
3455          *     if context not allocated for bus
3456          *           allocate and init context
3457          *           set present in root table for this bus
3458          *     init context with domain, translation etc
3459          *    endfor
3460          * endfor
3461          */
3462         pr_info("Setting RMRR:\n");
3463         for_each_rmrr_units(rmrr) {
3464                 /* some BIOS lists non-exist devices in DMAR table. */
3465                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3466                                           i, dev) {
3467                         ret = iommu_prepare_rmrr_dev(rmrr, dev);
3468                         if (ret)
3469                                 pr_err("Mapping reserved region failed\n");
3470                 }
3471         }
3472
3473         iommu_prepare_isa();
3474
3475 domains_done:
3476
3477         /*
3478          * for each drhd
3479          *   enable fault log
3480          *   global invalidate context cache
3481          *   global invalidate iotlb
3482          *   enable translation
3483          */
3484         for_each_iommu(iommu, drhd) {
3485                 if (drhd->ignored) {
3486                         /*
3487                          * we always have to disable PMRs or DMA may fail on
3488                          * this device
3489                          */
3490                         if (force_on)
3491                                 iommu_disable_protect_mem_regions(iommu);
3492                         continue;
3493                 }
3494
3495                 iommu_flush_write_buffer(iommu);
3496
3497 #ifdef CONFIG_INTEL_IOMMU_SVM
3498                 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3499                         ret = intel_svm_enable_prq(iommu);
3500                         if (ret)
3501                                 goto free_iommu;
3502                 }
3503 #endif
3504                 ret = dmar_set_interrupt(iommu);
3505                 if (ret)
3506                         goto free_iommu;
3507
3508                 if (!translation_pre_enabled(iommu))
3509                         iommu_enable_translation(iommu);
3510
3511                 iommu_disable_protect_mem_regions(iommu);
3512         }
3513
3514         return 0;
3515
3516 free_iommu:
3517         for_each_active_iommu(iommu, drhd) {
3518                 disable_dmar_iommu(iommu);
3519                 free_dmar_iommu(iommu);
3520         }
3521
3522         kfree(g_iommus);
3523
3524 error:
3525         return ret;
3526 }
3527
3528 /* This takes a number of _MM_ pages, not VTD pages */
3529 static unsigned long intel_alloc_iova(struct device *dev,
3530                                      struct dmar_domain *domain,
3531                                      unsigned long nrpages, uint64_t dma_mask)
3532 {
3533         unsigned long iova_pfn;
3534
3535         /* Restrict dma_mask to the width that the iommu can handle */
3536         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3537         /* Ensure we reserve the whole size-aligned region */
3538         nrpages = __roundup_pow_of_two(nrpages);
3539
3540         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3541                 /*
3542                  * First try to allocate an io virtual address in
3543                  * DMA_BIT_MASK(32) and if that fails then try allocating
3544                  * from higher range
3545                  */
3546                 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3547                                            IOVA_PFN(DMA_BIT_MASK(32)), false);
3548                 if (iova_pfn)
3549                         return iova_pfn;
3550         }
3551         iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3552                                    IOVA_PFN(dma_mask), true);
3553         if (unlikely(!iova_pfn)) {
3554                 dev_err(dev, "Allocating %ld-page iova failed", nrpages);
3555                 return 0;
3556         }
3557
3558         return iova_pfn;
3559 }
3560
/*
 * Return the DMA-API domain for @dev, allocating and initializing one if
 * the device does not have a domain yet.  Returns NULL on allocation
 * failure (an error is logged).
 */
struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
{
	struct dmar_domain *domain, *tmp;
	struct dmar_rmrr_unit *rmrr;
	struct device *i_dev;
	int i, ret;

	/* Fast path: the device already has a domain attached. */
	domain = find_domain(dev);
	if (domain)
		goto out;

	domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
	if (!domain)
		goto out;

	/* We have a new domain - setup possible RMRRs for the device */
	rcu_read_lock();
	for_each_rmrr_units(rmrr) {
		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
					  i, i_dev) {
			if (i_dev != dev)
				continue;

			/*
			 * Identity-map the reserved region so the device can
			 * keep accessing it; failure is logged but treated as
			 * non-fatal.
			 */
			ret = domain_prepare_identity_map(dev, domain,
							  rmrr->base_address,
							  rmrr->end_address);
			if (ret)
				dev_err(dev, "Mapping reserved region failed\n");
		}
	}
	rcu_read_unlock();

	/*
	 * Publish the domain.  If a concurrent caller attached a different
	 * domain first (tmp != domain), discard ours and use the winner's;
	 * tmp may also be NULL on failure.
	 */
	tmp = set_domain_for_dev(dev, domain);
	if (!tmp || domain != tmp) {
		domain_exit(domain);
		domain = tmp;
	}

out:

	if (!domain)
		dev_err(dev, "Allocating domain failed\n");


	return domain;
}
3607
3608 /* Check if the dev needs to go through non-identity map and unmap process.*/
3609 static int iommu_no_mapping(struct device *dev)
3610 {
3611         int found;
3612
3613         if (iommu_dummy(dev))
3614                 return 1;
3615
3616         if (!iommu_identity_mapping)
3617                 return 0;
3618
3619         found = identity_mapping(dev);
3620         if (found) {
3621                 if (iommu_should_identity_map(dev, 0))
3622                         return 1;
3623                 else {
3624                         /*
3625                          * 32 bit DMA is removed from si_domain and fall back
3626                          * to non-identity mapping.
3627                          */
3628                         dmar_remove_one_dev_info(dev);
3629                         dev_info(dev, "32bit DMA uses non-identity mapping\n");
3630                         return 0;
3631                 }
3632         } else {
3633                 /*
3634                  * In case of a detached 64 bit DMA device from vm, the device
3635                  * is put into si_domain for identity mapping.
3636                  */
3637                 if (iommu_should_identity_map(dev, 0)) {
3638                         int ret;
3639                         ret = domain_add_dev_info(si_domain, dev);
3640                         if (!ret) {
3641                                 dev_info(dev, "64bit DMA uses identity mapping\n");
3642                                 return 1;
3643                         }
3644                 }
3645         }
3646
3647         return 0;
3648 }
3649
/*
 * Map @size bytes at physical address @paddr for DMA by @dev within
 * @dma_mask.  Returns the bus/IOVA address to program into the device,
 * @paddr unchanged when the device bypasses translation, or
 * DMA_MAPPING_ERROR on failure.
 */
static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
				     size_t size, int dir, u64 dma_mask)
{
	struct dmar_domain *domain;
	phys_addr_t start_paddr;
	unsigned long iova_pfn;
	int prot = 0;
	int ret;
	struct intel_iommu *iommu;
	unsigned long paddr_pfn = paddr >> PAGE_SHIFT;

	BUG_ON(dir == DMA_NONE);

	/* Identity-mapped / dummy devices: bus address == physical address. */
	if (iommu_no_mapping(dev))
		return paddr;

	domain = get_valid_domain_for_dev(dev);
	if (!domain)
		return DMA_MAPPING_ERROR;

	iommu = domain_get_iommu(domain);
	/* From here on, "size" holds a VT-d page count, not bytes. */
	size = aligned_nrpages(paddr, size);

	iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
	if (!iova_pfn)
		goto error;

	/*
	 * Check if DMAR supports zero-length reads on write only
	 * mappings..
	 */
	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
			!cap_zlr(iommu->cap))
		prot |= DMA_PTE_READ;
	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
		prot |= DMA_PTE_WRITE;
	/*
	 * paddr - (paddr + size) might be partial page, we should map the whole
	 * page.  Note: if two part of one page are separately mapped, we
	 * might have two guest_addr mapping to the same host paddr, but this
	 * is not a big problem
	 */
	ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
				 mm_to_dma_pfn(paddr_pfn), size, prot);
	if (ret)
		goto error;

	/* Bus address = IOVA page base plus the sub-page offset of paddr. */
	start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
	start_paddr += paddr & ~PAGE_MASK;
	return start_paddr;

error:
	/*
	 * iova_pfn is zero when the IOVA allocation itself failed; only
	 * release it when the later page-table setup was what failed.
	 */
	if (iova_pfn)
		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
	dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
		size, (unsigned long long)paddr, dir);
	return DMA_MAPPING_ERROR;
}
3708
3709 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3710                                  unsigned long offset, size_t size,
3711                                  enum dma_data_direction dir,
3712                                  unsigned long attrs)
3713 {
3714         return __intel_map_single(dev, page_to_phys(page) + offset, size,
3715                                   dir, *dev->dma_mask);
3716 }
3717
3718 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3719                                      size_t size, enum dma_data_direction dir,
3720                                      unsigned long attrs)
3721 {
3722         return __intel_map_single(dev, phys_addr, size, dir, *dev->dma_mask);
3723 }
3724
/*
 * Tear down the IOVA mapping covering [dev_addr, dev_addr + size) for
 * @dev: clear the page tables, flush (or queue flushing of) the IOTLB,
 * and release the IOVA range plus any freed page-table pages.
 */
static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
{
	struct dmar_domain *domain;
	unsigned long start_pfn, last_pfn;
	unsigned long nrpages;
	unsigned long iova_pfn;
	struct intel_iommu *iommu;
	struct page *freelist;

	/* Devices that bypass translation have no mapping to undo. */
	if (iommu_no_mapping(dev))
		return;

	domain = find_domain(dev);
	BUG_ON(!domain);

	iommu = domain_get_iommu(domain);

	iova_pfn = IOVA_PFN(dev_addr);

	nrpages = aligned_nrpages(dev_addr, size);
	start_pfn = mm_to_dma_pfn(iova_pfn);
	last_pfn = start_pfn + nrpages - 1;

	dev_dbg(dev, "Device unmapping: pfn %lx-%lx\n", start_pfn, last_pfn);

	/*
	 * Unlink the page-table pages; they come back on @freelist so they
	 * can be released only after the IOTLB no longer references them.
	 */
	freelist = domain_unmap(domain, start_pfn, last_pfn);

	if (intel_iommu_strict) {
		/* Strict mode: flush synchronously, then free everything. */
		iommu_flush_iotlb_psi(iommu, domain, start_pfn,
				      nrpages, !freelist, 0);
		/* free iova */
		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
		dma_free_pagelist(freelist);
	} else {
		queue_iova(&domain->iovad, iova_pfn, nrpages,
			   (unsigned long)freelist);
		/*
		 * queue up the release of the unmap to save the 1/6th of the
		 * cpu used up by the iotlb flush operation...
		 */
	}
}
3767
3768 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3769                              size_t size, enum dma_data_direction dir,
3770                              unsigned long attrs)
3771 {
3772         intel_unmap(dev, dev_addr, size);
3773 }
3774
/*
 * dma_map_ops ->alloc callback: allocate @size bytes of zeroed, coherent
 * memory for @dev and map it bidirectionally.  Returns the kernel virtual
 * address and stores the bus address in *@dma_handle, or NULL on failure.
 */
static void *intel_alloc_coherent(struct device *dev, size_t size,
				  dma_addr_t *dma_handle, gfp_t flags,
				  unsigned long attrs)
{
	struct page *page = NULL;
	int order;

	size = PAGE_ALIGN(size);
	order = get_order(size);

	if (!iommu_no_mapping(dev))
		/* Translated devices don't need low-zone pages. */
		flags &= ~(GFP_DMA | GFP_DMA32);
	else if (dev->coherent_dma_mask < dma_get_required_mask(dev)) {
		/*
		 * No translation: the allocation itself must fall within
		 * the device's coherent DMA mask.
		 */
		if (dev->coherent_dma_mask < DMA_BIT_MASK(32))
			flags |= GFP_DMA;
		else
			flags |= GFP_DMA32;
	}

	if (gfpflags_allow_blocking(flags)) {
		unsigned int count = size >> PAGE_SHIFT;

		/* Try the CMA area first when we are allowed to sleep. */
		page = dma_alloc_from_contiguous(dev, count, order,
						 flags & __GFP_NOWARN);
		/*
		 * For untranslated devices a CMA page above the coherent
		 * mask is unusable - give it back and fall through to the
		 * page allocator.
		 */
		if (page && iommu_no_mapping(dev) &&
		    page_to_phys(page) + size > dev->coherent_dma_mask) {
			dma_release_from_contiguous(dev, page, count);
			page = NULL;
		}
	}

	if (!page)
		page = alloc_pages(flags, order);
	if (!page)
		return NULL;
	memset(page_address(page), 0, size);

	*dma_handle = __intel_map_single(dev, page_to_phys(page), size,
					 DMA_BIDIRECTIONAL,
					 dev->coherent_dma_mask);
	if (*dma_handle != DMA_MAPPING_ERROR)
		return page_address(page);
	/* Mapping failed: release via CMA if it came from there. */
	if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
		__free_pages(page, order);

	return NULL;
}
3822
3823 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3824                                 dma_addr_t dma_handle, unsigned long attrs)
3825 {
3826         int order;
3827         struct page *page = virt_to_page(vaddr);
3828
3829         size = PAGE_ALIGN(size);
3830         order = get_order(size);
3831
3832         intel_unmap(dev, dma_handle, size);
3833         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3834                 __free_pages(page, order);
3835 }
3836
3837 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3838                            int nelems, enum dma_data_direction dir,
3839                            unsigned long attrs)
3840 {
3841         dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3842         unsigned long nrpages = 0;
3843         struct scatterlist *sg;
3844         int i;
3845
3846         for_each_sg(sglist, sg, nelems, i) {
3847                 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3848         }
3849
3850         intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3851 }
3852
3853 static int intel_nontranslate_map_sg(struct device *hddev,
3854         struct scatterlist *sglist, int nelems, int dir)
3855 {
3856         int i;
3857         struct scatterlist *sg;
3858
3859         for_each_sg(sglist, sg, nelems, i) {
3860                 BUG_ON(!sg_page(sg));
3861                 sg->dma_address = sg_phys(sg);
3862                 sg->dma_length = sg->length;
3863         }
3864         return nelems;
3865 }
3866
3867 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3868                         enum dma_data_direction dir, unsigned long attrs)
3869 {
3870         int i;
3871         struct dmar_domain *domain;
3872         size_t size = 0;
3873         int prot = 0;
3874         unsigned long iova_pfn;
3875         int ret;
3876         struct scatterlist *sg;
3877         unsigned long start_vpfn;
3878         struct intel_iommu *iommu;
3879
3880         BUG_ON(dir == DMA_NONE);
3881         if (iommu_no_mapping(dev))
3882                 return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
3883
3884         domain = get_valid_domain_for_dev(dev);
3885         if (!domain)
3886                 return 0;
3887
3888         iommu = domain_get_iommu(domain);
3889
3890         for_each_sg(sglist, sg, nelems, i)
3891                 size += aligned_nrpages(sg->offset, sg->length);
3892
3893         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3894                                 *dev->dma_mask);
3895         if (!iova_pfn) {
3896                 sglist->dma_length = 0;
3897                 return 0;
3898         }
3899
3900         /*
3901          * Check if DMAR supports zero-length reads on write only
3902          * mappings..
3903          */
3904         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3905                         !cap_zlr(iommu->cap))
3906                 prot |= DMA_PTE_READ;