Linux-libre 5.3.12-gnu
[librecmc/linux-libre.git] / drivers / gpu / drm / amd / amdgpu / amdgpu_xgmi.c
1 /*
2  * Copyright 2018 Advanced Micro Devices, Inc.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included in
12  * all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20  * OTHER DEALINGS IN THE SOFTWARE.
21  *
22  *
23  */
24 #include <linux/list.h>
25 #include "amdgpu.h"
26 #include "amdgpu_xgmi.h"
27 #include "amdgpu_smu.h"
28
29
30 static DEFINE_MUTEX(xgmi_mutex);
31
32 #define AMDGPU_MAX_XGMI_HIVE                    8
33 #define AMDGPU_MAX_XGMI_DEVICE_PER_HIVE         4
34
35 static struct amdgpu_hive_info xgmi_hives[AMDGPU_MAX_XGMI_HIVE];
36 static unsigned hive_count = 0;
37
38 void *amdgpu_xgmi_hive_try_lock(struct amdgpu_hive_info *hive)
39 {
40         return &hive->device_list;
41 }
42
43 /**
44  * DOC: AMDGPU XGMI Support
45  *
46  * XGMI is a high speed interconnect that joins multiple GPU cards
47  * into a homogeneous memory space that is organized by a collective
48  * hive ID and individual node IDs, both of which are 64-bit numbers.
49  *
50  * The file xgmi_device_id contains the unique per GPU device ID and
51  * is stored in the /sys/class/drm/card${cardno}/device/ directory.
52  *
53  * Inside the device directory a sub-directory 'xgmi_hive_info' is
54  * created which contains the hive ID and the list of nodes.
55  *
56  * The hive ID is stored in:
57  *   /sys/class/drm/card${cardno}/device/xgmi_hive_info/xgmi_hive_id
58  *
59  * The node information is stored in numbered directories:
60  *   /sys/class/drm/card${cardno}/device/xgmi_hive_info/node${nodeno}/xgmi_device_id
61  *
62  * Each device has its own xgmi_hive_info directory with a mirror
63  * set of node sub-directories.
64  *
65  * The XGMI memory space is built by contiguously adding the power of
66  * two padded VRAM space from each node to each other.
67  *
68  */
69
70
71 static ssize_t amdgpu_xgmi_show_hive_id(struct device *dev,
72                 struct device_attribute *attr, char *buf)
73 {
74         struct amdgpu_hive_info *hive =
75                         container_of(attr, struct amdgpu_hive_info, dev_attr);
76
77         return snprintf(buf, PAGE_SIZE, "%llu\n", hive->hive_id);
78 }
79
80 static int amdgpu_xgmi_sysfs_create(struct amdgpu_device *adev,
81                                     struct amdgpu_hive_info *hive)
82 {
83         int ret = 0;
84
85         if (WARN_ON(hive->kobj))
86                 return -EINVAL;
87
88         hive->kobj = kobject_create_and_add("xgmi_hive_info", &adev->dev->kobj);
89         if (!hive->kobj) {
90                 dev_err(adev->dev, "XGMI: Failed to allocate sysfs entry!\n");
91                 return -EINVAL;
92         }
93
94         hive->dev_attr = (struct device_attribute) {
95                 .attr = {
96                         .name = "xgmi_hive_id",
97                         .mode = S_IRUGO,
98
99                 },
100                 .show = amdgpu_xgmi_show_hive_id,
101         };
102
103         ret = sysfs_create_file(hive->kobj, &hive->dev_attr.attr);
104         if (ret) {
105                 dev_err(adev->dev, "XGMI: Failed to create device file xgmi_hive_id\n");
106                 kobject_del(hive->kobj);
107                 kobject_put(hive->kobj);
108                 hive->kobj = NULL;
109         }
110
111         return ret;
112 }
113
114 static void amdgpu_xgmi_sysfs_destroy(struct amdgpu_device *adev,
115                                     struct amdgpu_hive_info *hive)
116 {
117         sysfs_remove_file(hive->kobj, &hive->dev_attr.attr);
118         kobject_del(hive->kobj);
119         kobject_put(hive->kobj);
120         hive->kobj = NULL;
121 }
122
123 static ssize_t amdgpu_xgmi_show_device_id(struct device *dev,
124                                      struct device_attribute *attr,
125                                      char *buf)
126 {
127         struct drm_device *ddev = dev_get_drvdata(dev);
128         struct amdgpu_device *adev = ddev->dev_private;
129
130         return snprintf(buf, PAGE_SIZE, "%llu\n", adev->gmc.xgmi.node_id);
131
132 }
133
134
135 static DEVICE_ATTR(xgmi_device_id, S_IRUGO, amdgpu_xgmi_show_device_id, NULL);
136
137
138 static int amdgpu_xgmi_sysfs_add_dev_info(struct amdgpu_device *adev,
139                                          struct amdgpu_hive_info *hive)
140 {
141         int ret = 0;
142         char node[10] = { 0 };
143
144         /* Create xgmi device id file */
145         ret = device_create_file(adev->dev, &dev_attr_xgmi_device_id);
146         if (ret) {
147                 dev_err(adev->dev, "XGMI: Failed to create device file xgmi_device_id\n");
148                 return ret;
149         }
150
151         /* Create sysfs link to hive info folder on the first device */
152         if (adev != hive->adev) {
153                 ret = sysfs_create_link(&adev->dev->kobj, hive->kobj,
154                                         "xgmi_hive_info");
155                 if (ret) {
156                         dev_err(adev->dev, "XGMI: Failed to create link to hive info");
157                         goto remove_file;
158                 }
159         }
160
161         sprintf(node, "node%d", hive->number_devices);
162         /* Create sysfs link form the hive folder to yourself */
163         ret = sysfs_create_link(hive->kobj, &adev->dev->kobj, node);
164         if (ret) {
165                 dev_err(adev->dev, "XGMI: Failed to create link from hive info");
166                 goto remove_link;
167         }
168
169         goto success;
170
171
172 remove_link:
173         sysfs_remove_link(&adev->dev->kobj, adev->ddev->unique);
174
175 remove_file:
176         device_remove_file(adev->dev, &dev_attr_xgmi_device_id);
177
178 success:
179         return ret;
180 }
181
182 static void amdgpu_xgmi_sysfs_rem_dev_info(struct amdgpu_device *adev,
183                                           struct amdgpu_hive_info *hive)
184 {
185         device_remove_file(adev->dev, &dev_attr_xgmi_device_id);
186         sysfs_remove_link(&adev->dev->kobj, adev->ddev->unique);
187         sysfs_remove_link(hive->kobj, adev->ddev->unique);
188 }
189
190
191
192 struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock)
193 {
194         int i;
195         struct amdgpu_hive_info *tmp;
196
197         if (!adev->gmc.xgmi.hive_id)
198                 return NULL;
199
200         mutex_lock(&xgmi_mutex);
201
202         for (i = 0 ; i < hive_count; ++i) {
203                 tmp = &xgmi_hives[i];
204                 if (tmp->hive_id == adev->gmc.xgmi.hive_id) {
205                         if (lock)
206                                 mutex_lock(&tmp->hive_lock);
207                         mutex_unlock(&xgmi_mutex);
208                         return tmp;
209                 }
210         }
211         if (i >= AMDGPU_MAX_XGMI_HIVE) {
212                 mutex_unlock(&xgmi_mutex);
213                 return NULL;
214         }
215
216         /* initialize new hive if not exist */
217         tmp = &xgmi_hives[hive_count++];
218
219         if (amdgpu_xgmi_sysfs_create(adev, tmp)) {
220                 mutex_unlock(&xgmi_mutex);
221                 return NULL;
222         }
223
224         tmp->adev = adev;
225         tmp->hive_id = adev->gmc.xgmi.hive_id;
226         INIT_LIST_HEAD(&tmp->device_list);
227         mutex_init(&tmp->hive_lock);
228         mutex_init(&tmp->reset_lock);
229
230         if (lock)
231                 mutex_lock(&tmp->hive_lock);
232         tmp->pstate = -1;
233         mutex_unlock(&xgmi_mutex);
234
235         return tmp;
236 }
237
238 int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)
239 {
240         int ret = 0;
241         struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
242
243         if (!hive)
244                 return 0;
245
246         if (hive->pstate == pstate)
247                 return 0;
248
249         dev_dbg(adev->dev, "Set xgmi pstate %d.\n", pstate);
250
251         if (is_support_sw_smu(adev))
252                 ret = smu_set_xgmi_pstate(&adev->smu, pstate);
253         if (ret)
254                 dev_err(adev->dev,
255                         "XGMI: Set pstate failure on device %llx, hive %llx, ret %d",
256                         adev->gmc.xgmi.node_id,
257                         adev->gmc.xgmi.hive_id, ret);
258
259         return ret;
260 }
261
262 int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev)
263 {
264         int ret = -EINVAL;
265
266         /* Each psp need to set the latest topology */
267         ret = psp_xgmi_set_topology_info(&adev->psp,
268                                          hive->number_devices,
269                                          &adev->psp.xgmi_context.top_info);
270         if (ret)
271                 dev_err(adev->dev,
272                         "XGMI: Set topology failure on device %llx, hive %llx, ret %d",
273                         adev->gmc.xgmi.node_id,
274                         adev->gmc.xgmi.hive_id, ret);
275
276         return ret;
277 }
278
279
280 int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
281                 struct amdgpu_device *peer_adev)
282 {
283         struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
284         int i;
285
286         for (i = 0 ; i < top->num_nodes; ++i)
287                 if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id)
288                         return top->nodes[i].num_hops;
289         return  -EINVAL;
290 }
291
292 int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
293 {
294         struct psp_xgmi_topology_info *top_info;
295         struct amdgpu_hive_info *hive;
296         struct amdgpu_xgmi      *entry;
297         struct amdgpu_device *tmp_adev = NULL;
298
299         int count = 0, ret = -EINVAL;
300
301         if (!adev->gmc.xgmi.supported)
302                 return 0;
303
304         ret = psp_xgmi_get_node_id(&adev->psp, &adev->gmc.xgmi.node_id);
305         if (ret) {
306                 dev_err(adev->dev,
307                         "XGMI: Failed to get node id\n");
308                 return ret;
309         }
310
311         ret = psp_xgmi_get_hive_id(&adev->psp, &adev->gmc.xgmi.hive_id);
312         if (ret) {
313                 dev_err(adev->dev,
314                         "XGMI: Failed to get hive id\n");
315                 return ret;
316         }
317
318         hive = amdgpu_get_xgmi_hive(adev, 1);
319         if (!hive) {
320                 ret = -EINVAL;
321                 dev_err(adev->dev,
322                         "XGMI: node 0x%llx, can not match hive 0x%llx in the hive list.\n",
323                         adev->gmc.xgmi.node_id, adev->gmc.xgmi.hive_id);
324                 goto exit;
325         }
326
327         top_info = &adev->psp.xgmi_context.top_info;
328
329         list_add_tail(&adev->gmc.xgmi.head, &hive->device_list);
330         list_for_each_entry(entry, &hive->device_list, head)
331                 top_info->nodes[count++].node_id = entry->node_id;
332         top_info->num_nodes = count;
333         hive->number_devices = count;
334
335         list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
336                 /* update node list for other device in the hive */
337                 if (tmp_adev != adev) {
338                         top_info = &tmp_adev->psp.xgmi_context.top_info;
339                         top_info->nodes[count - 1].node_id = adev->gmc.xgmi.node_id;
340                         top_info->num_nodes = count;
341                 }
342                 ret = amdgpu_xgmi_update_topology(hive, tmp_adev);
343                 if (ret)
344                         goto exit;
345         }
346
347         /* get latest topology info for each device from psp */
348         list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
349                 ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count,
350                                 &tmp_adev->psp.xgmi_context.top_info);
351                 if (ret) {
352                         dev_err(tmp_adev->dev,
353                                 "XGMI: Get topology failure on device %llx, hive %llx, ret %d",
354                                 tmp_adev->gmc.xgmi.node_id,
355                                 tmp_adev->gmc.xgmi.hive_id, ret);
356                         /* To do : continue with some node failed or disable the whole hive */
357                         goto exit;
358                 }
359         }
360
361         if (!ret)
362                 ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive);
363
364
365         mutex_unlock(&hive->hive_lock);
366 exit:
367         if (!ret)
368                 dev_info(adev->dev, "XGMI: Add node %d, hive 0x%llx.\n",
369                          adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id);
370         else
371                 dev_err(adev->dev, "XGMI: Failed to add node %d, hive 0x%llx ret: %d\n",
372                         adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id,
373                         ret);
374
375         return ret;
376 }
377
378 void amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
379 {
380         struct amdgpu_hive_info *hive;
381
382         if (!adev->gmc.xgmi.supported)
383                 return;
384
385         hive = amdgpu_get_xgmi_hive(adev, 1);
386         if (!hive)
387                 return;
388
389         if (!(hive->number_devices--)) {
390                 amdgpu_xgmi_sysfs_destroy(adev, hive);
391                 mutex_destroy(&hive->hive_lock);
392                 mutex_destroy(&hive->reset_lock);
393         } else {
394                 amdgpu_xgmi_sysfs_rem_dev_info(adev, hive);
395                 mutex_unlock(&hive->hive_lock);
396         }
397 }