#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
void parallel_matrix_multiply(double **A, double **B, double **C, int n) {
    #pragma omp parallel for collapse(2)
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            double sum = 0.0;
            for (int k = 0; k < n; k++) {
                sum += A[i][k] * B[k][j];
            }
            C[i][j] = sum;
        }
    }
}
int main() {
    int n = 1000;
    double **A, **B, **C;
    // Allocate matrices...
    double start = omp_get_wtime();
    parallel_matrix_multiply(A, B, C, n);
    double end = omp_get_wtime();
    printf("Time: %f seconds\n", end - start);
    return 0;
}
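The allocation in main is left as a comment above; a minimal sketch of one way to fill it in follows, assuming a contiguous backing array with row pointers. The alloc_matrix helper is illustrative and not part of the original listing.

// Illustrative only: one possible allocator for the n x n matrices above.
// A single contiguous block plus row pointers keeps rows cache-friendly.
double **alloc_matrix(int n) {
    double *data = malloc((size_t)n * n * sizeof(double));
    double **rows = malloc((size_t)n * sizeof(double *));
    for (int i = 0; i < n; i++) {
        rows[i] = data + (size_t)i * n;
    }
    return rows;
}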
__global__ void matrix_multiply_kernel(float *A, float *B, float *C, int n) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < n && col < n) {
        float sum = 0.0f;
        for (int k = 0; k < n; k++) {
            sum += A[row * n + k] * B[k * n + col];
        }
        C[row * n + col] = sum;
    }
}
void gpu_matrix_multiply(float *h_A, float *h_B, float *h_C, int n) {
    float *d_A, *d_B, *d_C;
    size_t size = n * n * sizeof(float);
    // Allocate GPU memory
    cudaMalloc(&d_A, size);
    cudaMalloc(&d_B, size);
    cudaMalloc(&d_C, size);
    // Copy data to GPU
    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
    // Launch kernel: one thread per output element, 16x16 threads per block
    dim3 threadsPerBlock(16, 16);
    dim3 numBlocks((n + 15) / 16, (n + 15) / 16);
    matrix_multiply_kernel<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, n);
    // Copy result back (this cudaMemcpy waits for the kernel to finish)
    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
    // Cleanup
    cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
}
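The CUDA listing stops at the host-side wrapper. Below is an illustrative driver, not part of the original, assuming ordinary pageable host allocations and a constant fill so the result is easy to sanity-check.

// Illustrative driver only: allocates host matrices and calls gpu_matrix_multiply.
#include <stdio.h>
#include <stdlib.h>
int main() {
    int n = 1000;
    size_t size = (size_t)n * n * sizeof(float);
    float *h_A = (float *)malloc(size);
    float *h_B = (float *)malloc(size);
    float *h_C = (float *)malloc(size);
    for (int i = 0; i < n * n; i++) {
        h_A[i] = 1.0f;  // simple fill so the expected result is known
        h_B[i] = 2.0f;
    }
    gpu_matrix_multiply(h_A, h_B, h_C, n);
    // With this fill, every element of C should equal 2 * n.
    printf("C[0] = %f (expected %f)\n", h_C[0], 2.0f * n);
    free(h_A); free(h_B); free(h_C);
    return 0;
}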
import numpy as np
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
import multiprocessing as mp
def chunk_multiply(args):
    """Multiply one horizontal band of A by the full matrix B."""
    A_chunk, B, start_row, end_row = args
    return np.dot(A_chunk, B)
def parallel_matrix_multiply(A, B, num_processes=None):
    if num_processes is None:
        num_processes = mp.cpu_count()
    n = A.shape[0]
    chunk_size = n // num_processes
    # Create chunks
    chunks = []
    for i in range(num_processes):
        start_row = i * chunk_size
        end_row = start_row + chunk_size if i < num_processes - 1 else n
        A_chunk = A[start_row:end_row]
        chunks.append((A_chunk, B, start_row, end_row))
    # Parallel computation
    with ProcessPoolExecutor(max_workers=num_processes) as executor:
        results = list(executor.map(chunk_multiply, chunks))
    # Combine results
    return np.vstack(results)
# Usage (guarded so that worker processes spawned by
# ProcessPoolExecutor do not re-run this block on import)
if __name__ == "__main__":
    A = np.random.rand(1000, 1000)
    B = np.random.rand(1000, 1000)
    C = parallel_matrix_multiply(A, B)
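ThreadPoolExecutor is imported above but not exercised. The sketch below is an illustrative thread-based variant of the same row-chunking scheme, not part of the original listing; it assumes that NumPy's BLAS-backed np.dot releases the GIL, which is what lets the threads overlap.

# Illustrative thread-based variant of the same row-chunking scheme.
# Threads avoid the pickling/IPC cost of processes; they help here because
# NumPy typically releases the GIL inside BLAS-backed np.dot calls.
def parallel_matrix_multiply_threads(A, B, num_threads=None):
    if num_threads is None:
        num_threads = mp.cpu_count()
    n = A.shape[0]
    chunk_size = max(1, n // num_threads)
    chunks = []
    for start_row in range(0, n, chunk_size):
        end_row = min(start_row + chunk_size, n)
        chunks.append((A[start_row:end_row], B, start_row, end_row))
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        results = list(executor.map(chunk_multiply, chunks))
    return np.vstack(results)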
import java.util.concurrent.*;
import java.util.stream.IntStream;
public class ParallelMatrixMultiply {
    public static double[][] multiply(double[][] A, double[][] B) {
        int n = A.length;
        double[][] C = new double[n][n];
        // Parallelize over rows with a parallel stream; the inner streams
        // stay sequential so the common ForkJoinPool is not oversubscribed
        IntStream.range(0, n).parallel().forEach(i -> {
            IntStream.range(0, n).forEach(j -> {
                C[i][j] = IntStream.range(0, n)
                        .mapToDouble(k -> A[i][k] * B[k][j])
                        .sum();
            });
        });
        return C;
    }
    // Alternative using ForkJoinPool
    public static class MatrixTask extends RecursiveTask<Void> {
        private final double[][] A, B, C;
        private final int startRow, endRow, n;
        private static final int THRESHOLD = 64;

        public MatrixTask(double[][] A, double[][] B, double[][] C,
                          int startRow, int endRow, int n) {
            this.A = A; this.B = B; this.C = C;
            this.startRow = startRow; this.endRow = endRow; this.n = n;
        }

        @Override
        protected Void compute() {
            if (endRow - startRow <= THRESHOLD) {
                // Direct computation for small tasks
                for (int i = startRow; i < endRow; i++) {
                    for (int j = 0; j < n; j++) {
                        double sum = 0.0;
                        for (int k = 0; k < n; k++) {
                            sum += A[i][k] * B[k][j];
                        }
                        C[i][j] = sum;
                    }
                }
            } else {
                // Split task
                int mid = (startRow + endRow) / 2;
                MatrixTask task1 = new MatrixTask(A, B, C, startRow, mid, n);
                MatrixTask task2 = new MatrixTask(A, B, C, mid, endRow, n);
                invokeAll(task1, task2);
            }
            return null;
        }
    }
}
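The ForkJoinPool variant defines MatrixTask but never submits it. The snippet below is an illustrative driver in its own file; the MatrixDemo class name is made up for this sketch and is not part of the original listing.

// Illustrative driver, not from the original listing: submits the root
// MatrixTask, which splits itself recursively down to THRESHOLD rows.
import java.util.concurrent.ForkJoinPool;

public class MatrixDemo {
    public static void main(String[] args) {
        int n = 1000;
        double[][] A = new double[n][n];
        double[][] B = new double[n][n];
        for (int i = 0; i < n; i++) {
            for (int j = 0; j < n; j++) {
                A[i][j] = Math.random();
                B[i][j] = Math.random();
            }
        }
        double[][] C = new double[n][n];
        // The root task covers rows [0, n); invoke() blocks until it finishes.
        ForkJoinPool.commonPool().invoke(
                new ParallelMatrixMultiply.MatrixTask(A, B, C, 0, n, n));
        System.out.println("C[0][0] = " + C[0][0]);
    }
}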
package main
import (
    "fmt"
    "runtime"
    "sync"
    "time"
)
func parallelMatrixMultiply(A, B [][]float64) [][]float64 {
    n := len(A)
    C := make([][]float64, n)
    for i := range C {
        C[i] = make([]float64, n)
    }
    numWorkers := runtime.NumCPU()
    rowsPerWorker := n / numWorkers
    var wg sync.WaitGroup
    for worker := 0; worker < numWorkers; worker++ {
        startRow := worker * rowsPerWorker
        endRow := startRow + rowsPerWorker
        // The last worker also takes the remainder rows when n is not
        // evenly divisible by the number of workers.
        if worker == numWorkers-1 {
            endRow = n
        }
        wg.Add(1)
        go func(startRow, endRow int) {
            defer wg.Done()
            for i := startRow; i < endRow; i++ {
                for j := 0; j < n; j++ {
                    var sum float64
                    for k := 0; k < n; k++ {
                        sum += A[i][k] * B[k][j]
                    }
                    C[i][j] = sum
                }
            }
        }(startRow, endRow)
    }
    wg.Wait()
    return C
}
func main() {
    n := 1000
    A := make([][]float64, n)
    B := make([][]float64, n)
    // Initialize matrices...
    start := time.Now()
    C := parallelMatrixMultiply(A, B)
    elapsed := time.Since(start)
    fmt.Printf("Parallel computation of %dx%d matrices took %v\n", len(C), len(C), elapsed)
}
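The initialization step in main is elided above; one possible helper is sketched below. The randomMatrix name is illustrative, not part of the original, and it assumes "math/rand" is added to the import block.

// Illustrative only: allocate and fill an n x n matrix with random values.
// Requires "math/rand" in the import block.
func randomMatrix(n int) [][]float64 {
    m := make([][]float64, n)
    for i := range m {
        m[i] = make([]float64, n)
        for j := range m[i] {
            m[i][j] = rand.Float64()
        }
    }
    return m
}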